Rdatatable · mattdowle · Jan 12, 2018 · Oct 19, 2017 · Nov 10, 2017 · Nov 14, 2017
@@ -59,6 +59,7 @@
 
 12. Faster `as.IDate` and `as.ITime` methods for `POSIXct` and `numeric`, [#1392](https://github.com/Rdatatable/data.table/issues/1392). Thanks to Jan Gorecki for the PR.
 
+13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR. 
 
 #### BUG FIXES
 

@@ -811,7 +811,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
       if (is.character(j)) {
         if (notj) {
           w = chmatch(j, names(x))
-          if (any(is.na(w))) {
+          if (anyNA(w)) {
             warning("column(s) not removed because not found: ",paste(j[is.na(w)],collapse=","))
             w = w[!is.na(w)]
           }
@@ -1036,12 +1036,12 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
           } else if (is.numeric(.SDcols)) {
             # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
             if (length(unique(sign(.SDcols))) != 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
-            if (any(is.na(.SDcols)) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
+            if (anyNA(.SDcols) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
             if (colm) ansvars = dupdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
             ansvals = if (colm) setdiff(seq_along(names(x)), c(as.integer(.SDcols), which(names(x) %chin% bynames))) else as.integer(.SDcols)
           } else {
             if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
-            if (any(is.na(.SDcols)) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
+            if (anyNA(.SDcols) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
             if (colm) ansvars = setdiff(setdiff(names(x), .SDcols), bynames) else ansvars = .SDcols
             # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
             ansvals = chmatch(ansvars, names(x))
@@ -1979,7 +1979,7 @@ tail.data.table <- function(x, n=6, ...) {
   } else i = NULL          # meaning (to C code) all rows, without allocating 1L:nrow(x) vector
   if (missing(j)) j=names(x)
   if (!is.atomic(j)) stop("j must be atomic vector, see ?is.atomic")
-  if (any(is.na(j))) stop("NA in j")
+  if (anyNA(j)) stop("NA in j")
   if (is.character(j)) {
     newnames = setdiff(j,names(x))
     cols = as.integer(chmatch(j, c(names(x),newnames)))
@@ -2197,6 +2197,7 @@ is_na <- function(x, by=seq_along(x)) .Call(Cdt_na, x, by)
 any_na <- function(x, by=seq_along(x)) .Call(CanyNA, x, by)
 
 na.omit.data.table <- function (object, cols = seq_along(object), invert = FALSE, ...) {
+  # compare to stats:::na.omit.data.frame
   if (!cedta()) return(NextMethod())
   if ( !missing(invert) && is.na(as.logical(invert)) )
     stop("Argument 'invert' must be logical TRUE/FALSE")
@@ -2209,9 +2210,10 @@ na.omit.data.table <- function (object, cols = seq_along(object), invert = FALSE
   }
   cols = as.integer(cols)
   ix = .Call(Cdt_na, object, cols)
-  ans = .Call(CsubsetDT, object, which_(ix, bool = invert), seq_along(object))
-  if (any(ix)) setindexv(ans, NULL)[] else ans #1734
-  # compare the above to stats:::na.omit.data.frame
+  if (any(ix))
+    .Call(CsubsetDT, object, which_(ix, bool = invert), seq_along(object))
+  else
+    object
 }
 
 which_ <- function(x, bool = TRUE) {
@@ -2479,7 +2481,7 @@ setnames <- function(x,old,new) {
       if (!is.character(old)) stop("'old' is type ",typeof(old)," but should be integer, double or character")
       if (any(duplicated(old))) stop("Some duplicates exist in 'old': ", paste(old[duplicated(old)],collapse=","))
       i = chmatch(old,names(x))
-      if (any(is.na(i))) stop("Items of 'old' not found in column names: ",paste(old[is.na(i)],collapse=","))
+      if (anyNA(i)) stop("Items of 'old' not found in column names: ",paste(old[is.na(i)],collapse=","))
       if (any(tt<-!is.na(chmatch(old,names(x)[-i])))) stop("Some items of 'old' are duplicated (ambiguous) in column names: ",paste(old[tt],collapse=","))
     }
     if (length(new)!=length(i)) stop("'old' is length ",length(i)," but 'new' is length ",length(new))
@@ -2524,7 +2526,7 @@ setcolorder <- function(x, neworder)
   if (is.character(neworder)) {
     if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ", paste(names(x)[duplicated(names(x))], collapse=","), ". Please remove or rename the duplicate(s) and try again.")
     o = as.integer(chmatch(neworder, names(x)))
-    if (any(is.na(o))) stop("Names in neworder not found in x: ", paste(neworder[is.na(o)], collapse=","))
+    if (anyNA(o)) stop("Names in neworder not found in x: ", paste(neworder[is.na(o)], collapse=","))
   } else {
     if (!is.numeric(neworder)) stop("neworder is not a character or numeric vector")
     o = as.integer(neworder)

@@ -12,33 +12,62 @@ duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq
   # fix for bug #5405 - unique on null data table returns error (because of 'forderv')
   # however, in this case we can bypass having to go to forderv at all.
   if (!length(query$by)) return(logical(0))
-  res <- rep.int(TRUE, nrow(x))
 
   if (query$use.keyprefix) {
     f = uniqlist(shallow(x, query$by))
     if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
   } else {
     o = forderv(x, by=query$by, sort=FALSE, retGrp=TRUE)
+    if (attr(o, 'maxgrpn') == 1L) return(rep.int(FALSE, nrow(x)))
     f = attr(o,"starts")
     if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
-    if (length(o)) f=o[f]
+    if (length(o)) f = o[f]
   }
+  res <- rep.int(TRUE, nrow(x))
   res[f] = FALSE
   res
 }
 
 unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
   if (!cedta()) return(NextMethod("unique"))
-  if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key")))  #1284
-    by = key(x)
-  dups <- duplicated.data.table(x, incomparables, fromLast, by, ...)
-  ans <- .Call(CsubsetDT, x, which_(dups, FALSE), seq_len(ncol(x))) # more memory efficient version of which(!dups)
-  if (nrow(x) != nrow(ans)) setindexv(ans, NULL)[] else ans #1760
-  # i.e. x[!dups] but avoids [.data.table overhead when unique() is loop'd
-  # TO DO: allow logical to be passed through to C level, and allow cols=NULL to mean all, for further speed gain.
+  if (!identical(incomparables, FALSE)) {
+    .NotYetUsed("incomparables != FALSE")
+  }
+  if (nrow(x) <= 1L) return(x)
+  if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) by = key(x)  #1284
+  else if (is.null(by)) by=seq_along(x)
+  o = forderv(x, by=by, sort=FALSE, retGrp=TRUE)
+  # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit
+  # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist retuns a new nrow(x) vector anyway and isn't
+  # as efficient as forderv returning empty o when input is already ordered
+  if (attr(o, 'maxgrpn') == 1L) return(x)  # avoid copy. Oftentimes, user just wants to check DT is unique with perhaps nrow(unique(DT))==nrow(DT)
+  f = attr(o,"starts")
+  if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
+  if (length(o)) f = o[f]
+  if (length(o <- forderv(f))) f = f[o]  # don't sort the uniques too
+  .Call(CsubsetDT, x, f, seq_len(ncol(x)))
+  # TO DO: allow by=NULL to mean all, for further speed gain.
   #        See news for v1.9.3 for link to benchmark use-case on datatable-help.
 }
 
+# Test for #2013 unique() memory efficiency improvement in v1.10.5
+# set.seed(1)
+# Create unique 7.6GB DT on 16GB laptop
+# DT = data.table(
+#  A = sample(1e8, 2e8, TRUE),
+#  B = sample(1e8, 2e8, TRUE),
+#  C = 1:2e8,
+#  D = 1:2e8,
+#  E = 1:2e8,
+#  F = 1:2e8,
+#  G = 1:2e8,
+#  H = 1:2e8,
+#  I = 1:2e8,
+#  J = 1:2e8
+# )
+# print(dim(unique(DT)))  # works now, failed with oom in 1.10.4-3
+
+
 ## Specify the column names to be used in the uniqueness query, and if this
 ## query can take advantage of the keys of `x` (if present).
 ## returns a list

@@ -97,7 +97,7 @@ aggregate_funs <- function(funs, vals, sep="_", ...) {
 dcast.data.table <- function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) {
   if (!is.data.table(data)) stop("'data' must be a data.table.")
   drop = as.logical(rep(drop, length.out=2L))
-  if (any(is.na(drop))) stop("'drop' must be logical TRUE/FALSE")
+  if (anyNA(drop)) stop("'drop' must be logical TRUE/FALSE")
   lvals = value_vars(value.var, names(data))
   valnames = unique(unlist(lvals))
   lvars = check_formula(formula, names(data), valnames)

@@ -37,9 +37,9 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.
     stop("A non-empty vector of column names is required for by.y")
   if (!identical(by.y, key(y)[seq_along(by.y)]))
     stop("The first ", length(by.y), " columns of y's key is not identical to the columns specified in by.y.")
-  if (any(is.na(chmatch(by.x, names(x)))))
+  if (anyNA(chmatch(by.x, names(x))))
     stop("Elements listed in 'by.x' must be valid names in data.table 'x'")
-  if (any(is.na(chmatch(by.y, names(y)))))
+  if (anyNA(chmatch(by.y, names(y))))
     stop("Elements listed in 'by.y' must be valid names in data.table 'y'")
   if (anyDuplicated(by.x) || anyDuplicated(by.y))
     stop("Duplicate columns are not allowed in overlap joins. This may change in the future.")

@@ -134,10 +134,9 @@ setreordervec <- function(x, order) .Call(Creorder, x, order)
 # Maybe just a grep through *.R for use of these function internally would be better (TO DO).
 
 # Don't use base::is.unsorted internally, because :
-#    1) it returns NA if any(is.na(.)) where NAs are detected at R level, inefficiently
-#    2) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but
+#    1) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but
 #       "sorted" attribute will need an extra attribute "locale" so we can check if key's locale is the current locale)
-#    3) wrapper needed, used to be :
+#    2) wrapper needed, used to be :
 #       identical(FALSE,is.unsorted(x)) && !(length(x)==1 && is.na(x))
 #       where the && was needed to maintain backwards compatibility after r-devel's change of is.unsorted(NA) to FALSE (was NA) [May 2013].
 # The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for
@@ -177,7 +176,14 @@ forderv <- function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.la
   } else {
     if (!length(x)) return(integer(0)) # to be consistent with base::order. this'll make sure forderv(NULL) will result in error
                        # (as base does) but forderv(data.table(NULL)) and forderv(list()) will return integer(0))
-    if (is.character(by)) by=chmatch(by, names(x))
+    if (is.character(by)) {
+      w = chmatch(by, names(x))
+      if (anyNA(w)) stop("'by' contains '",by[is.na(w)][1],"' which is not a column name")
+      by = w
+    }
+    else if (typeof(by)=="double" && isReallyReal(by)) {
+      stop("'by' is type 'double' but one or more items in it are not whole integers")
+    }
     by = as.integer(by)
     if ( (length(order) != 1L && length(order) != length(by)) || any(!order %in% c(1L, -1L)) )
       stop("x is a list, length(order) must be either =1 or =length(by) and each value should be 1 or -1 for each column in 'by', corresponding to ascending or descending order, respectively. If length(order) == 1, it will be recycled to length(by).")

@@ -2812,10 +2812,10 @@ test(988, unique(dt, by='B'), dt[!duplicated(df[, 'B'])])
 test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])])
 test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])])
 test(991, unique(dt, by=NULL), dt[!duplicated(df)])
-test(991.1, unique(dt, by=4), error="Integer values between 1 and ncol are required")
-test(991.2, unique(dt, by=c(1,3.1)), error="Integer values between 1 and ncol are required")
+test(991.1, unique(dt, by=4), error="'by' value 4 out of range.*1,3")
+test(991.2, unique(dt, by=c(1,3.1)), error="'by' is type 'double' but one or more items in it are not whole integers")
 test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])])
-test(991.4, unique(dt, by=c('C','D','E')), error="by specifies column names that do not exist. First 5: D,E")
+test(991.4, unique(dt, by=c('C','D','E')), error="'by' contains 'D' which is not a column name")
 
 # :=NULL on factor column in empty data.table, #4809
 DT = data.table(A = integer(), B = factor())
@@ -11243,6 +11243,10 @@ test(1859, fread("A\n", nrows=0), data.table(A=logical()))  # 2512
 
 test(1860, fread("A,B\n "), data.table(A=logical(), B=logical()))  # 2543
 
+# That unique(DT) returns DT when there are no dups, #2013
+DT = data.table(A=c(1L,1L,2L), B=c(3L,4L,4L))
+test(1861, address(unique(DT)), address(DT))
+
 ##########################
 
 # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time.

@@ -46,6 +46,7 @@ SEXP char_Date;
 SEXP char_POSIXct;
 SEXP char_nanotime;
 SEXP sym_sorted;
+SEXP sym_index;
 SEXP sym_BY;
 SEXP sym_starts, char_starts;
 SEXP sym_maxgrpn;

@@ -248,6 +248,7 @@ void attribute_visible R_init_datatable(DllInfo *info)
   // look odd (and devs in future might be tempted to remove them). Avoiding passing install() to API calls
   // keeps the code neat and readable. Also see grep's added to CRAN_Release.cmd to find such calls.
   sym_sorted  = install("sorted");
+  sym_index   = install("index");
   sym_BY      = install(".BY");
   sym_maxgrpn = install("maxgrpn");
 

@@ -125,7 +125,7 @@ static SEXP subsetVectorRaw(SEXP target, SEXP source, SEXP idx, Rboolean any0orN
   return target;
 }
 
-static void check_idx(SEXP idx, int max, /*outputs...*/int *ansLen, Rboolean *any0orNA)
+static void check_idx(SEXP idx, int max, /*outputs...*/int *ansLen, Rboolean *any0orNA, Rboolean *monotonic)
 // count non-0 in idx => the length of the subset result stored in *ansLen
 // return whether any 0, NA (or >max) exist and set any0orNA if so, for branchless subsetVectorRaw
 // >max is treated as NA for consistency with [.data.frame and operations like cbind(DT[w],DT[w+1])
@@ -135,17 +135,21 @@ static void check_idx(SEXP idx, int max, /*outputs...*/int *ansLen, Rboolean *an
 // single cache efficient sweep so no need to go parallel (well, very low priority to go parallel)
 {
   if (!isInteger(idx)) error("Internal error. 'idx' is type '%s' not 'integer'", type2char(TYPEOF(idx)));
-  Rboolean anyNeg=FALSE, anyNA=FALSE;
+  Rboolean anyNeg=FALSE, anyNA=FALSE, anyLess=FALSE;
   int ans=0;
+  int last = INT32_MIN;
   for (int i=0; i<LENGTH(idx); i++) {
     int this = INTEGER(idx)[i];
     ans += (this!=0);
     anyNeg |= this<0 && this!=NA_INTEGER;
     anyNA |= this==NA_INTEGER || this>max;
+    anyLess |= this<last;
+    last = this;
   }
   if (anyNeg) error("Internal error: idx contains negatives. Should have been dealt with earlier.");
   *ansLen = ans;
   *any0orNA = ans<LENGTH(idx) || anyNA;
+  *monotonic = !anyLess; // for the purpose of ordered keys, this==last is allowed
 }
 
 // TODO - currently called from R level first. Can it be called from check_idx instead?
@@ -222,8 +226,8 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
 
   // check index once up front for 0 or NA, for branchless subsetVectorRaw
   R_len_t ansn=0;
-  Rboolean any0orNA=FALSE;
-  check_idx(rows, length(VECTOR_ELT(x,0)), &ansn, &any0orNA);
+  Rboolean any0orNA=FALSE, orderedSubset=FALSE;
+  check_idx(rows, length(VECTOR_ELT(x,0)), &ansn, &any0orNA, &orderedSubset);
 
   if (!isInteger(cols)) error("Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'", type2char(TYPEOF(cols)));
   for (int i=0; i<LENGTH(cols); i++) {
@@ -276,7 +280,9 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
   setAttrib(ans, R_RowNamesSymbol, tmp);  // The contents of tmp must be set before being passed to setAttrib(). setAttrib looks at tmp value and copies it in the case of R_RowNamesSymbol. Caused hard to track bug around 28 Sep 2014.
   UNPROTECT(1);
 
-  // maintain key if ordered subset ...
+  // clear any index that was copied over by copyMostAttrib() above, e.g. #1760 and #1734 (test 1678)
+  setAttrib(ans, sym_index, R_NilValue);
+  // but maintain key if ordered subset
   SEXP key = getAttrib(x, sym_sorted);
   if (length(key)) {
     SEXP in = PROTECT(chmatch(key,getAttrib(ans,R_NamesSymbol), 0, TRUE)); // (nomatch ignored when in=TRUE)
@@ -287,11 +293,10 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
       setAttrib(ans, sym_sorted, R_NilValue);
       // clear key that was copied over by copyMostAttrib() above
     } else {
-      if (isOrderedSubset(rows, PROTECT(ScalarInteger(length(VECTOR_ELT(x,0)))))) {
+      if (orderedSubset) {
         setAttrib(ans, sym_sorted, tmp=allocVector(STRSXP, i));
         for (int j=0; j<i; j++) SET_STRING_ELT(tmp, j, STRING_ELT(key, j));
       }
-      UNPROTECT(1);  // the ScalarInteger above. isOrderedSubset() is exposed at R level hence needs SEXP
     }
   }
   setAttrib(ans, install(".data.table.locked"), R_NilValue);
@@ -302,8 +307,8 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
 
 SEXP subsetVector(SEXP x, SEXP idx) { // idx is 1-based passed from R level
   int ansn;
-  Rboolean any0orNA;
-  check_idx(idx, length(x), &ansn, &any0orNA);
+  Rboolean any0orNA, orderedSubset;
+  check_idx(idx, length(x), &ansn, &any0orNA, &orderedSubset);
   SEXP ans = PROTECT(allocVector(TYPEOF(x), ansn));
   SETLENGTH(ans, ansn);
   SET_TRUELENGTH(ans, ansn);
@@ -313,4 +318,3 @@ SEXP subsetVector(SEXP x, SEXP idx) { // idx is 1-based passed from R level
   return ans;
 }
 
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -59,6 +59,7 @@

		12. Faster `as.IDate` and `as.ITime` methods for `POSIXct` and `numeric`, [#1392](https://github.com/Rdatatable/data.table/issues/1392). Thanks to Jan Gorecki for the PR.

		13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR.

		#### BUG FIXES

Expand Down