Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@

12. Faster `as.IDate` and `as.ITime` methods for `POSIXct` and `numeric`, [#1392](https://github.com/Rdatatable/data.table/issues/1392). Thanks to Jan Gorecki for the PR.

13. `unique(DT)` now returns `DT` early when there are no duplicates to save RAM, [#2013](https://github.com/Rdatatable/data.table/issues/2013). Thanks to Michael Chirico for the PR.

#### BUG FIXES

Expand Down
20 changes: 11 additions & 9 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
if (is.character(j)) {
if (notj) {
w = chmatch(j, names(x))
if (any(is.na(w))) {
if (anyNA(w)) {
warning("column(s) not removed because not found: ",paste(j[is.na(w)],collapse=","))
w = w[!is.na(w)]
}
Expand Down Expand Up @@ -1036,12 +1036,12 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
} else if (is.numeric(.SDcols)) {
# if .SDcols is numeric, use 'dupdiff' instead of 'setdiff'
if (length(unique(sign(.SDcols))) != 1L) stop(".SDcols is numeric but has both +ve and -ve indices")
if (any(is.na(.SDcols)) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
if (anyNA(.SDcols) || any(abs(.SDcols)>ncol(x)) || any(abs(.SDcols)<1L)) stop(".SDcols is numeric but out of bounds (or NA)")
if (colm) ansvars = dupdiff(names(x)[-.SDcols], bynames) else ansvars = names(x)[.SDcols]
ansvals = if (colm) setdiff(seq_along(names(x)), c(as.integer(.SDcols), which(names(x) %chin% bynames))) else as.integer(.SDcols)
} else {
if (!is.character(.SDcols)) stop(".SDcols should be column numbers or names")
if (any(is.na(.SDcols)) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
if (anyNA(.SDcols) || any(!.SDcols %chin% names(x))) stop("Some items of .SDcols are not column names (or are NA)")
if (colm) ansvars = setdiff(setdiff(names(x), .SDcols), bynames) else ansvars = .SDcols
# dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
ansvals = chmatch(ansvars, names(x))
Expand Down Expand Up @@ -1979,7 +1979,7 @@ tail.data.table <- function(x, n=6, ...) {
} else i = NULL # meaning (to C code) all rows, without allocating 1L:nrow(x) vector
if (missing(j)) j=names(x)
if (!is.atomic(j)) stop("j must be atomic vector, see ?is.atomic")
if (any(is.na(j))) stop("NA in j")
if (anyNA(j)) stop("NA in j")
if (is.character(j)) {
newnames = setdiff(j,names(x))
cols = as.integer(chmatch(j, c(names(x),newnames)))
Expand Down Expand Up @@ -2197,6 +2197,7 @@ is_na <- function(x, by=seq_along(x)) .Call(Cdt_na, x, by)
any_na <- function(x, by=seq_along(x)) .Call(CanyNA, x, by)

na.omit.data.table <- function (object, cols = seq_along(object), invert = FALSE, ...) {
# compare to stats:::na.omit.data.frame
if (!cedta()) return(NextMethod())
if ( !missing(invert) && is.na(as.logical(invert)) )
stop("Argument 'invert' must be logical TRUE/FALSE")
Expand All @@ -2209,9 +2210,10 @@ na.omit.data.table <- function (object, cols = seq_along(object), invert = FALSE
}
cols = as.integer(cols)
ix = .Call(Cdt_na, object, cols)
ans = .Call(CsubsetDT, object, which_(ix, bool = invert), seq_along(object))
if (any(ix)) setindexv(ans, NULL)[] else ans #1734
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is removing index now? Won't we end up with corrupted index here?

Copy link
Copy Markdown
Member

@MichaelChirico MichaelChirico Jan 15, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jangorecki hmm can't say I know... @mattdowle I think this came from your commit: aacf2b9

Copy link
Copy Markdown
Member

@jangorecki jangorecki Jan 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK it will make index corrupted. Once #1762 will be solved we don't need to care about that anymore. Could you ensure there is a unit test for this index corruption now? @MichaelChirico update: probably solved already by Matt, details in mentioned issue.

Copy link
Copy Markdown
Member Author

@mattdowle mattdowle Feb 6, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jangorecki Yes, as you noted in #1762 the index is removed inside CsubsetDT. It's better to remove it in a central place at C level as close to where the update happens, rather than having to remember to clear the index each time we call CsubsetDT. It contains the comment "// clear any index that was copied over by copyMostAttrib() above, e.g. #1760 and #1734 (test 1678)"

# compare the above to stats:::na.omit.data.frame
if (any(ix))
.Call(CsubsetDT, object, which_(ix, bool = invert), seq_along(object))
else
object
}

which_ <- function(x, bool = TRUE) {
Expand Down Expand Up @@ -2479,7 +2481,7 @@ setnames <- function(x,old,new) {
if (!is.character(old)) stop("'old' is type ",typeof(old)," but should be integer, double or character")
if (any(duplicated(old))) stop("Some duplicates exist in 'old': ", paste(old[duplicated(old)],collapse=","))
i = chmatch(old,names(x))
if (any(is.na(i))) stop("Items of 'old' not found in column names: ",paste(old[is.na(i)],collapse=","))
if (anyNA(i)) stop("Items of 'old' not found in column names: ",paste(old[is.na(i)],collapse=","))
if (any(tt<-!is.na(chmatch(old,names(x)[-i])))) stop("Some items of 'old' are duplicated (ambiguous) in column names: ",paste(old[tt],collapse=","))
}
if (length(new)!=length(i)) stop("'old' is length ",length(i)," but 'new' is length ",length(new))
Expand Down Expand Up @@ -2524,7 +2526,7 @@ setcolorder <- function(x, neworder)
if (is.character(neworder)) {
if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ", paste(names(x)[duplicated(names(x))], collapse=","), ". Please remove or rename the duplicate(s) and try again.")
o = as.integer(chmatch(neworder, names(x)))
if (any(is.na(o))) stop("Names in neworder not found in x: ", paste(neworder[is.na(o)], collapse=","))
if (anyNA(o)) stop("Names in neworder not found in x: ", paste(neworder[is.na(o)], collapse=","))
} else {
if (!is.numeric(neworder)) stop("neworder is not a character or numeric vector")
o = as.integer(neworder)
Expand Down
47 changes: 38 additions & 9 deletions R/duplicated.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,33 +12,62 @@ duplicated.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq
# fix for bug #5405 - unique on null data table returns error (because of 'forderv')
# however, in this case we can bypass having to go to forderv at all.
if (!length(query$by)) return(logical(0))
res <- rep.int(TRUE, nrow(x))

if (query$use.keyprefix) {
f = uniqlist(shallow(x, query$by))
if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
} else {
o = forderv(x, by=query$by, sort=FALSE, retGrp=TRUE)
if (attr(o, 'maxgrpn') == 1L) return(rep.int(FALSE, nrow(x)))
f = attr(o,"starts")
if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
if (length(o)) f=o[f]
if (length(o)) f = o[f]
}
res <- rep.int(TRUE, nrow(x))
res[f] = FALSE
res
}

unique.data.table <- function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) {
if (!cedta()) return(NextMethod("unique"))
if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) #1284
by = key(x)
dups <- duplicated.data.table(x, incomparables, fromLast, by, ...)
ans <- .Call(CsubsetDT, x, which_(dups, FALSE), seq_len(ncol(x))) # more memory efficient version of which(!dups)
if (nrow(x) != nrow(ans)) setindexv(ans, NULL)[] else ans #1760
# i.e. x[!dups] but avoids [.data.table overhead when unique() is loop'd
# TO DO: allow logical to be passed through to C level, and allow cols=NULL to mean all, for further speed gain.
if (!identical(incomparables, FALSE)) {
.NotYetUsed("incomparables != FALSE")
}
if (nrow(x) <= 1L) return(x)
if (missing(by) && isTRUE(getOption("datatable.old.unique.by.key"))) by = key(x) #1284
else if (is.null(by)) by=seq_along(x)
o = forderv(x, by=by, sort=FALSE, retGrp=TRUE)
# if by=key(x), forderv tests for orderedness within it quickly and will short-circuit
# there isn't any need in unique() to call uniqlist like duplicated does; uniqlist retuns a new nrow(x) vector anyway and isn't
# as efficient as forderv returning empty o when input is already ordered
if (attr(o, 'maxgrpn') == 1L) return(x) # avoid copy. Oftentimes, user just wants to check DT is unique with perhaps nrow(unique(DT))==nrow(DT)
f = attr(o,"starts")
if (fromLast) f = cumsum(uniqlengths(f, nrow(x)))
if (length(o)) f = o[f]
if (length(o <- forderv(f))) f = f[o] # don't sort the uniques too
.Call(CsubsetDT, x, f, seq_len(ncol(x)))
# TO DO: allow by=NULL to mean all, for further speed gain.
# See news for v1.9.3 for link to benchmark use-case on datatable-help.
}

# Test for #2013 unique() memory efficiency improvement in v1.10.5
# set.seed(1)
# Create unique 7.6GB DT on 16GB laptop
# DT = data.table(
# A = sample(1e8, 2e8, TRUE),
# B = sample(1e8, 2e8, TRUE),
# C = 1:2e8,
# D = 1:2e8,
# E = 1:2e8,
# F = 1:2e8,
# G = 1:2e8,
# H = 1:2e8,
# I = 1:2e8,
# J = 1:2e8
# )
# print(dim(unique(DT))) # works now, failed with oom in 1.10.4-3


## Specify the column names to be used in the uniqueness query, and if this
## query can take advantage of the keys of `x` (if present).
## returns a list
Expand Down
2 changes: 1 addition & 1 deletion R/fcast.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ aggregate_funs <- function(funs, vals, sep="_", ...) {
dcast.data.table <- function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose")) {
if (!is.data.table(data)) stop("'data' must be a data.table.")
drop = as.logical(rep(drop, length.out=2L))
if (any(is.na(drop))) stop("'drop' must be logical TRUE/FALSE")
if (anyNA(drop)) stop("'drop' must be logical TRUE/FALSE")
lvals = value_vars(value.var, names(data))
valnames = unique(unlist(lvals))
lvars = check_formula(formula, names(data), valnames)
Expand Down
4 changes: 2 additions & 2 deletions R/foverlaps.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.
stop("A non-empty vector of column names is required for by.y")
if (!identical(by.y, key(y)[seq_along(by.y)]))
stop("The first ", length(by.y), " columns of y's key is not identical to the columns specified in by.y.")
if (any(is.na(chmatch(by.x, names(x)))))
if (anyNA(chmatch(by.x, names(x))))
stop("Elements listed in 'by.x' must be valid names in data.table 'x'")
if (any(is.na(chmatch(by.y, names(y)))))
if (anyNA(chmatch(by.y, names(y))))
stop("Elements listed in 'by.y' must be valid names in data.table 'y'")
if (anyDuplicated(by.x) || anyDuplicated(by.y))
stop("Duplicate columns are not allowed in overlap joins. This may change in the future.")
Expand Down
14 changes: 10 additions & 4 deletions R/setkey.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,9 @@ setreordervec <- function(x, order) .Call(Creorder, x, order)
# Maybe just a grep through *.R for use of these function internally would be better (TO DO).

# Don't use base::is.unsorted internally, because :
# 1) it returns NA if any(is.na(.)) where NAs are detected at R level, inefficiently
# 2) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but
# 1) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but
# "sorted" attribute will need an extra attribute "locale" so we can check if key's locale is the current locale)
# 3) wrapper needed, used to be :
# 2) wrapper needed, used to be :
# identical(FALSE,is.unsorted(x)) && !(length(x)==1 && is.na(x))
# where the && was needed to maintain backwards compatibility after r-devel's change of is.unsorted(NA) to FALSE (was NA) [May 2013].
# The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for
Expand Down Expand Up @@ -177,7 +176,14 @@ forderv <- function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.la
} else {
if (!length(x)) return(integer(0)) # to be consistent with base::order. this'll make sure forderv(NULL) will result in error
# (as base does) but forderv(data.table(NULL)) and forderv(list()) will return integer(0))
if (is.character(by)) by=chmatch(by, names(x))
if (is.character(by)) {
w = chmatch(by, names(x))
if (anyNA(w)) stop("'by' contains '",by[is.na(w)][1],"' which is not a column name")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

heads up that anyNA is R 3.1, in case we decide to keep the 3.0 dependency

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here changing to any(is.na(.)) won't hurt much, I don't think grouping by million of columns would be useful.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment for completeness ... yep good point, we're now >= R 3.1 for other reasons, so can use anyNA

by = w
}
else if (typeof(by)=="double" && isReallyReal(by)) {
stop("'by' is type 'double' but one or more items in it are not whole integers")
}
by = as.integer(by)
if ( (length(order) != 1L && length(order) != length(by)) || any(!order %in% c(1L, -1L)) )
stop("x is a list, length(order) must be either =1 or =length(by) and each value should be 1 or -1 for each column in 'by', corresponding to ascending or descending order, respectively. If length(order) == 1, it will be recycled to length(by).")
Expand Down
10 changes: 7 additions & 3 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -2812,10 +2812,10 @@ test(988, unique(dt, by='B'), dt[!duplicated(df[, 'B'])])
test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])])
test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])])
test(991, unique(dt, by=NULL), dt[!duplicated(df)])
test(991.1, unique(dt, by=4), error="Integer values between 1 and ncol are required")
test(991.2, unique(dt, by=c(1,3.1)), error="Integer values between 1 and ncol are required")
test(991.1, unique(dt, by=4), error="'by' value 4 out of range.*1,3")
test(991.2, unique(dt, by=c(1,3.1)), error="'by' is type 'double' but one or more items in it are not whole integers")
test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])])
test(991.4, unique(dt, by=c('C','D','E')), error="by specifies column names that do not exist. First 5: D,E")
test(991.4, unique(dt, by=c('C','D','E')), error="'by' contains 'D' which is not a column name")

# :=NULL on factor column in empty data.table, #4809
DT = data.table(A = integer(), B = factor())
Expand Down Expand Up @@ -11243,6 +11243,10 @@ test(1859, fread("A\n", nrows=0), data.table(A=logical())) # 2512

test(1860, fread("A,B\n "), data.table(A=logical(), B=logical())) # 2543

# That unique(DT) returns DT when there are no dups, #2013
DT = data.table(A=c(1L,1L,2L), B=c(3L,4L,4L))
test(1861, address(unique(DT)), address(DT))

##########################

# TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time.
Expand Down
1 change: 1 addition & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ SEXP char_Date;
SEXP char_POSIXct;
SEXP char_nanotime;
SEXP sym_sorted;
SEXP sym_index;
SEXP sym_BY;
SEXP sym_starts, char_starts;
SEXP sym_maxgrpn;
Expand Down
1 change: 1 addition & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ void attribute_visible R_init_datatable(DllInfo *info)
// look odd (and devs in future might be tempted to remove them). Avoiding passing install() to API calls
// keeps the code neat and readable. Also see grep's added to CRAN_Release.cmd to find such calls.
sym_sorted = install("sorted");
sym_index = install("index");
sym_BY = install(".BY");
sym_maxgrpn = install("maxgrpn");

Expand Down
24 changes: 14 additions & 10 deletions src/subset.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ static SEXP subsetVectorRaw(SEXP target, SEXP source, SEXP idx, Rboolean any0orN
return target;
}

static void check_idx(SEXP idx, int max, /*outputs...*/int *ansLen, Rboolean *any0orNA)
static void check_idx(SEXP idx, int max, /*outputs...*/int *ansLen, Rboolean *any0orNA, Rboolean *monotonic)
// count non-0 in idx => the length of the subset result stored in *ansLen
// return whether any 0, NA (or >max) exist and set any0orNA if so, for branchless subsetVectorRaw
// >max is treated as NA for consistency with [.data.frame and operations like cbind(DT[w],DT[w+1])
Expand All @@ -135,17 +135,21 @@ static void check_idx(SEXP idx, int max, /*outputs...*/int *ansLen, Rboolean *an
// single cache efficient sweep so no need to go parallel (well, very low priority to go parallel)
{
if (!isInteger(idx)) error("Internal error. 'idx' is type '%s' not 'integer'", type2char(TYPEOF(idx)));
Rboolean anyNeg=FALSE, anyNA=FALSE;
Rboolean anyNeg=FALSE, anyNA=FALSE, anyLess=FALSE;
int ans=0;
int last = INT32_MIN;
for (int i=0; i<LENGTH(idx); i++) {
int this = INTEGER(idx)[i];
ans += (this!=0);
anyNeg |= this<0 && this!=NA_INTEGER;
anyNA |= this==NA_INTEGER || this>max;
anyLess |= this<last;
last = this;
}
if (anyNeg) error("Internal error: idx contains negatives. Should have been dealt with earlier.");
*ansLen = ans;
*any0orNA = ans<LENGTH(idx) || anyNA;
*monotonic = !anyLess; // for the purpose of ordered keys, this==last is allowed
}

// TODO - currently called from R level first. Can it be called from check_idx instead?
Expand Down Expand Up @@ -222,8 +226,8 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {

// check index once up front for 0 or NA, for branchless subsetVectorRaw
R_len_t ansn=0;
Rboolean any0orNA=FALSE;
check_idx(rows, length(VECTOR_ELT(x,0)), &ansn, &any0orNA);
Rboolean any0orNA=FALSE, orderedSubset=FALSE;
check_idx(rows, length(VECTOR_ELT(x,0)), &ansn, &any0orNA, &orderedSubset);

if (!isInteger(cols)) error("Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'", type2char(TYPEOF(cols)));
for (int i=0; i<LENGTH(cols); i++) {
Expand Down Expand Up @@ -276,7 +280,9 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
setAttrib(ans, R_RowNamesSymbol, tmp); // The contents of tmp must be set before being passed to setAttrib(). setAttrib looks at tmp value and copies it in the case of R_RowNamesSymbol. Caused hard to track bug around 28 Sep 2014.
UNPROTECT(1);

// maintain key if ordered subset ...
// clear any index that was copied over by copyMostAttrib() above, e.g. #1760 and #1734 (test 1678)
setAttrib(ans, sym_index, R_NilValue);
// but maintain key if ordered subset
SEXP key = getAttrib(x, sym_sorted);
if (length(key)) {
SEXP in = PROTECT(chmatch(key,getAttrib(ans,R_NamesSymbol), 0, TRUE)); // (nomatch ignored when in=TRUE)
Expand All @@ -287,11 +293,10 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
setAttrib(ans, sym_sorted, R_NilValue);
// clear key that was copied over by copyMostAttrib() above
} else {
if (isOrderedSubset(rows, PROTECT(ScalarInteger(length(VECTOR_ELT(x,0)))))) {
if (orderedSubset) {
setAttrib(ans, sym_sorted, tmp=allocVector(STRSXP, i));
for (int j=0; j<i; j++) SET_STRING_ELT(tmp, j, STRING_ELT(key, j));
}
UNPROTECT(1); // the ScalarInteger above. isOrderedSubset() is exposed at R level hence needs SEXP
}
}
setAttrib(ans, install(".data.table.locked"), R_NilValue);
Expand All @@ -302,8 +307,8 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {

SEXP subsetVector(SEXP x, SEXP idx) { // idx is 1-based passed from R level
int ansn;
Rboolean any0orNA;
check_idx(idx, length(x), &ansn, &any0orNA);
Rboolean any0orNA, orderedSubset;
check_idx(idx, length(x), &ansn, &any0orNA, &orderedSubset);
SEXP ans = PROTECT(allocVector(TYPEOF(x), ansn));
SETLENGTH(ans, ansn);
SET_TRUELENGTH(ans, ansn);
Expand All @@ -313,4 +318,3 @@ SEXP subsetVector(SEXP x, SEXP idx) { // idx is 1-based passed from R level
return ans;
}