From 87f4ee6314507592be0be96d3db1353f51c16ca8 Mon Sep 17 00:00:00 2001 From: "NORDEXAG\\BonschM" Date: Thu, 7 Sep 2017 13:35:50 +0200 Subject: [PATCH 1/6] Fixed .shallow to consistently retain keys and indices. --- NEWS.md | 3 +++ R/data.table.R | 39 +++++++++++++++++++++++++++++++++------ R/foverlaps.R | 3 --- inst/tests/tests.Rraw | 32 +++++++++++++++++++++++++++++++- 4 files changed, 67 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 39b680c321..2993361b55 100644 --- a/NEWS.md +++ b/NEWS.md @@ -78,6 +78,9 @@ 17. `.SD` would incorrectly include symbol on lhs of `:=` when `.SDcols` is specified and `get()` appears in `j`. Thanks @renkun-ken for reporting and the PR. Closes #2326. +18. .shallow retains keys correctly [#2336](https://github.com/Rdatatable/data.table/issues/2336) + Thanks to @MarkusBonsch for reporting and fixing + #### NOTES 1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing. diff --git a/R/data.table.R b/R/data.table.R index 642baed5cf..7c8a2af79e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2353,14 +2353,41 @@ point <- function(to, to_idx, from, from_idx) { isnull = is.null(cols) if (!isnull) cols = validate(cols, x) # NULL is default = all columns ans = .Call(Cshallowwrapper, x, cols) # copies VECSXP only - if (retain.key && isnull) return(ans) # handle most frequent case first - # rest of the cases - cols = names(x)[cols] - retain.key = retain.key && identical(cols, head(key(x), length(cols))) - setattr(ans, 'sorted', if (haskey(x) && retain.key) cols else NULL) + + if(retain.key){ + if(isnull) return(ans) # handle most frequent case first + ## get correct key if cols are present + cols = names(x)[cols] + keylength <- which.first(!key(ans) %chin% cols) - 1L + if(is.na(keylength)) keylength <- length(key(ans)) + if(!keylength){ + setattr(ans, "sorted", NULL) ## no key remaining + } else { + setattr(ans, "sorted", head(key(ans), keylength)) ## keep what can be kept + } + ## take care of attributes. + indices <- names(attributes(attr(ans, "index"))) + for(index in indices){ + indexcols <- strsplit(index, split = "__")[[1]][-1L] + indexlength <- which.first(!indexcols %chin% cols) - 1L + if(is.na(indexlength)) next ## all columns are present, nothing to be done + reducedindex <- paste0(c("", indexcols[seq_len(indexlength)]), collapse = "__") ## the columns until the first missing form the new index + if(reducedindex %chin% indices || !indexlength){ + ## Either reduced index already present or no columns of the original index remain. + ## Drop the original index completely + setattr(attr(ans, "index", exact = TRUE), index, NULL) + } else { + ## rename index to reducedindex + names(attributes(attr(ans, "index")))[names(attributes(attr(ans, "index"))) == index] <- reducedindex + } + } + } else { # retain.key == FALSE + setattr(ans, "sorted", NULL) + setattr(ans, "index", NULL) + } if (unlock) setattr(ans, '.data.table.locked', NULL) ans - # TODO: check/remove attributes for secondary keys? + } shallow <- function(x, cols=NULL) { diff --git a/R/foverlaps.R b/R/foverlaps.R index 80fa573913..b5ac466a0f 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -77,9 +77,6 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by. ## hopefully all checks are over. Now onto the actual task at hand. origx = x; x = shallow(x, by.x) origy = y; y = shallow(y, by.y) - if (identical(by.x, key(origx)[seq_along(by.x)])) - setattr(x, 'sorted', by.x) - setattr(y, 'sorted', by.y) ## is definitely sorted on by.y roll = switch(type, start=, end=, equal= 0.0, any=, within= +Inf) make_call <- function(names, fun=NULL) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5dfd0534b7..33bafb435b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6841,11 +6841,41 @@ setkey(x1, a1, a2) test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y))) test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2")) # also test shallow here so as to catch future regressions -x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), key="a1,a2") +x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2") test(1545.1, key(.shallow(x1, cols="a2")), NULL) test(1545.2, key(.shallow(x1, retain.key=FALSE)), NULL) test(1545.3, key(.shallow(x1, retain.key=TRUE)), key(x1)) test(1545.4, key(.shallow(x1, cols="a1", retain.key=TRUE)), "a1") +# tests for #2336. .shallow drops keys unnecessarily +test(1545.5, key(.shallow(x1, cols=c("a1", "a3"), retain.key=TRUE)), "a1") +test(1545.6, .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE), .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE)) +test(1545.7, key(.shallow(x1, cols=c("a1", "a2", "a3"), retain.key=TRUE)), c("a1", "a2")) +test(1545.8, key(.shallow(x1, cols=c("a2", "a3"), retain.key=TRUE)), NULL) +test(1545.9, key(.shallow(x1, cols=c("a2"), retain.key=TRUE)), NULL) +test(1545.10, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL) +# tests for #2336. .shallow now retains indices as well +x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), a2 = c(2L, 2L, 1L, 2L, 3L, 2L), a3 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE), key="a1,a2") +setindex(x1, a1, a2, a3) +setindex(x1, a1, a3) +test(1545.11, indices(.shallow(x1, retain.key=FALSE)), NULL) +test(1545.12, indices(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) +test(1545.13, indices(.shallow(x1, retain.key=TRUE)), indices(x1)) +test(1545.14, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a2__a3")], c("a1", "a2", "a3")), integer(0)) +test(1545.15, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) +test(1545.16, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.17, indices(.shallow(x1, cols = "a1", retain.key=TRUE)), c("a1")) +test(1545.18, forderv(.shallow(x1, cols = "a1", retain.key=TRUE)[attr(attr(.shallow(x1, cols = "a1", retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.19, attributes(attr(.shallow(x1, cols = c("a1", "a2"), retain.key = TRUE), "index", exact = TRUE)), attributes(attr(.shallow(x1, cols = c("a2", "a1"), retain.key = TRUE), "index", exact = TRUE))) +test(1545.20, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2", "a1")) +test(1545.21, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.22, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1__a2")], c("a1", "a2")), integer(0)) +test(1545.23, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1", "a1__a3")) +test(1545.24, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.25, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) +test(1545.26, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL) +test(1545.27, indices(.shallow(x1, cols = c("a3"), retain.key=TRUE)), NULL) +test(1545.28, .shallow(x1, cols = c("a1", "a2", "a3"), retain.key=TRUE), .shallow(x1, retain.key=TRUE)) + # test for #1234 df1 = df2 = data.frame(cats = rep(c('', ' ', 'meow'), 5)) From 7ffd8addf7c778ea2c9118be28e0235d31b33a55 Mon Sep 17 00:00:00 2001 From: "NORDEXAG\\BonschM" Date: Fri, 8 Sep 2017 19:16:40 +0200 Subject: [PATCH 2/6] Testing foverlaps problem in travis CI. --- R/foverlaps.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/foverlaps.R b/R/foverlaps.R index b5ac466a0f..f7ebb6c99e 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -77,6 +77,12 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by. ## hopefully all checks are over. Now onto the actual task at hand. origx = x; x = shallow(x, by.x) origy = y; y = shallow(y, by.y) + + if (identical(by.x, key(origx)[seq_along(by.x)])) + setattr(x, 'sorted', by.x) + setattr(y, 'sorted', by.y) ## is definitely sorted on by.y + + roll = switch(type, start=, end=, equal= 0.0, any=, within= +Inf) make_call <- function(names, fun=NULL) { From cf3bed2c7a0bfc4c09d7bd99069dd6fceac5b050 Mon Sep 17 00:00:00 2001 From: "NORDEXAG\\BonschM" Date: Fri, 8 Sep 2017 19:43:09 +0200 Subject: [PATCH 3/6] Reverted foverlaps to master. --- R/foverlaps.R | 365 +++++++++++++++++++++++++------------------------- 1 file changed, 181 insertions(+), 184 deletions(-) diff --git a/R/foverlaps.R b/R/foverlaps.R index f7ebb6c99e..0a7ad9c9ab 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -1,182 +1,179 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.y = key(y), maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=getOption("datatable.nomatch"), which = FALSE, verbose=getOption("datatable.verbose")) { - - if (!is.data.table(y) || !is.data.table(x)) stop("y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.") - maxgap = as.integer(maxgap); minoverlap = as.integer(minoverlap); - which = as.logical(which); nomatch = as.integer(nomatch); - if (!length(maxgap) || length(maxgap) != 1L || is.na(maxgap) || maxgap < 0L) - stop("maxgap must be a non-negative integer value of length 1") - if (!length(minoverlap) || length(minoverlap) != 1L || is.na(minoverlap) || minoverlap < 1L) - stop("minoverlap must be a positive integer value of length 1") - if (!length(which) || length(which) != 1L || is.na(which)) - stop("which must be a logical vector of length 1. Either TRUE/FALSE") - if (!length(nomatch) || length(nomatch) != 1L || (!is.na(nomatch) && nomatch!=0L)) - stop("nomatch must either be NA or 0, or (ideally) NA_integer_ or 0L") - type = match.arg(type) - mult = match.arg(mult) - if (type == "equal") - stop("type = 'equal' is not implemented yet. But note that this is just the same as a normal data.table join y[x, ...], unless you are also interested in setting 'minoverlap / maxgap' arguments. But those arguments are not implemented yet as well.") - if (maxgap > 0L || minoverlap > 1L) - stop("maxgap and minoverlap arguments are not yet implemented.") - if (is.null(by.y)) - stop("'y' must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) first, see ?setkey. Also check the examples in ?foverlaps.") - if (length(by.x) < 2L || length(by.y) < 2L) - stop("'by.x' and 'by.y' should contain at least two column names (or numbers) each - corresponding to 'start' and 'end' points of intervals. Please see ?foverlaps and examples for more info.") - if (is.numeric(by.x)) { - if (any(by.x < 0L) || any(by.x > length(x))) - stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.x <= length(x)") - by.x = names(x)[by.x] - } - if (is.numeric(by.y)) { - if (any(by.y < 0L) || any(by.y > length(y))) - stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.y <= length(y)") - by.y = names(y)[by.y] - } - if (!length(by.x) || !is.character(by.x)) - stop("A non-empty vector of column names is required for by.x") - if (!length(by.y) || !is.character(by.y)) - stop("A non-empty vector of column names is required for by.y") - if (!identical(by.y, key(y)[seq_along(by.y)])) - stop("The first ", length(by.y), " columns of y's key is not identical to the columns specified in by.y.") - if (any(is.na(chmatch(by.x, names(x))))) - stop("Elements listed in 'by.x' must be valid names in data.table 'x'") - if (any(is.na(chmatch(by.y, names(y))))) - stop("Elements listed in 'by.y' must be valid names in data.table 'y'") - if (anyDuplicated(by.x) || anyDuplicated(by.y)) - stop("Duplicate columns are not allowed in overlap joins. This may change in the future.") - if (length(by.x) != length(by.y)) - stop("length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths.") - if (any(dup.x<-duplicated(names(x)))) #1730 - handling join possible but would require workarounds on setcolorder further, it is really better just to rename dup column - stop("x has some duplicated column name(s): ",paste(unique(names(x)[dup.x]),collapse=","),". Please remove or rename the duplicate(s) and try again.") - if (any(dup.y<-duplicated(names(y)))) - stop("y has some duplicated column name(s): ",paste(unique(names(y)[dup.y]),collapse=","),". Please remove or rename the duplicate(s) and try again.") - - xnames = by.x; xintervals = tail(xnames, 2L); - ynames = by.y; yintervals = tail(ynames, 2L); - if (!storage.mode(x[[xintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(x[[xintervals[2L]]]) %chin% c("double", "integer")) - stop("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table 'x' and must be integer/numeric type.") - if ( any(x[[xintervals[2L]]] - x[[xintervals[1L]]] < 0L) ) - stop("All entries in column ", xintervals[1L], " should be <= corresponding entries in column ", xintervals[2L], " in data.table 'x'") - if (!storage.mode(y[[yintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(y[[yintervals[2L]]]) %chin% c("double", "integer")) - stop("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table 'y' and must be integer/numeric type.") - if ( any(y[[yintervals[2L]]] - y[[yintervals[1L]]] < 0L) ) - stop("All entries in column ", yintervals[1L], " should be <= corresponding entries in column ", yintervals[2L], " in data.table 'y'") - ## see NOTES below: - yclass = c(class(y[[yintervals[1L]]]), class(y[[yintervals[2L]]])) - isdouble = FALSE; isposix = FALSE - if ( any(c("numeric", "POSIXct") %chin% yclass) ) { - # next representive double > x under the given precision (48,56 or 64-bit in data.table) = x*incr - dt_eps <- function() { - bits = floor(log2(.Machine$double.eps)) - 2 ^ (bits + (getNumericRounding() * 8L)) - } - incr = 1 + dt_eps() - isdouble = TRUE - isposix = "POSIXct" %chin% yclass - } else incr = 1L # integer or Date class for example - - ## hopefully all checks are over. Now onto the actual task at hand. - origx = x; x = shallow(x, by.x) - origy = y; y = shallow(y, by.y) - - if (identical(by.x, key(origx)[seq_along(by.x)])) - setattr(x, 'sorted', by.x) - setattr(y, 'sorted', by.y) ## is definitely sorted on by.y - - - roll = switch(type, start=, end=, equal= 0.0, - any=, within= +Inf) - make_call <- function(names, fun=NULL) { - if (is.character(names)) - names = lapply(names, as.name) - call = c(substitute(fun, list(fun=fun)), names) - if (!is.null(fun)) as.call(call) else call + + if (!is.data.table(y) || !is.data.table(x)) stop("y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.") + maxgap = as.integer(maxgap); minoverlap = as.integer(minoverlap); + which = as.logical(which); nomatch = as.integer(nomatch); + if (!length(maxgap) || length(maxgap) != 1L || is.na(maxgap) || maxgap < 0L) + stop("maxgap must be a non-negative integer value of length 1") + if (!length(minoverlap) || length(minoverlap) != 1L || is.na(minoverlap) || minoverlap < 1L) + stop("minoverlap must be a positive integer value of length 1") + if (!length(which) || length(which) != 1L || is.na(which)) + stop("which must be a logical vector of length 1. Either TRUE/FALSE") + if (!length(nomatch) || length(nomatch) != 1L || (!is.na(nomatch) && nomatch!=0L)) + stop("nomatch must either be NA or 0, or (ideally) NA_integer_ or 0L") + type = match.arg(type) + mult = match.arg(mult) + if (type == "equal") + stop("type = 'equal' is not implemented yet. But note that this is just the same as a normal data.table join y[x, ...], unless you are also interested in setting 'minoverlap / maxgap' arguments. But those arguments are not implemented yet as well.") + if (maxgap > 0L || minoverlap > 1L) + stop("maxgap and minoverlap arguments are not yet implemented.") + if (is.null(by.y)) + stop("'y' must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) first, see ?setkey. Also check the examples in ?foverlaps.") + if (length(by.x) < 2L || length(by.y) < 2L) + stop("'by.x' and 'by.y' should contain at least two column names (or numbers) each - corresponding to 'start' and 'end' points of intervals. Please see ?foverlaps and examples for more info.") + if (is.numeric(by.x)) { + if (any(by.x < 0L) || any(by.x > length(x))) + stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.x <= length(x)") + by.x = names(x)[by.x] + } + if (is.numeric(by.y)) { + if (any(by.y < 0L) || any(by.y > length(y))) + stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.y <= length(y)") + by.y = names(y)[by.y] + } + if (!length(by.x) || !is.character(by.x)) + stop("A non-empty vector of column names is required for by.x") + if (!length(by.y) || !is.character(by.y)) + stop("A non-empty vector of column names is required for by.y") + if (!identical(by.y, key(y)[seq_along(by.y)])) + stop("The first ", length(by.y), " columns of y's key is not identical to the columns specified in by.y.") + if (any(is.na(chmatch(by.x, names(x))))) + stop("Elements listed in 'by.x' must be valid names in data.table 'x'") + if (any(is.na(chmatch(by.y, names(y))))) + stop("Elements listed in 'by.y' must be valid names in data.table 'y'") + if (anyDuplicated(by.x) || anyDuplicated(by.y)) + stop("Duplicate columns are not allowed in overlap joins. This may change in the future.") + if (length(by.x) != length(by.y)) + stop("length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths.") + if (any(dup.x<-duplicated(names(x)))) #1730 - handling join possible but would require workarounds on setcolorder further, it is really better just to rename dup column + stop("x has some duplicated column name(s): ",paste(unique(names(x)[dup.x]),collapse=","),". Please remove or rename the duplicate(s) and try again.") + if (any(dup.y<-duplicated(names(y)))) + stop("y has some duplicated column name(s): ",paste(unique(names(y)[dup.y]),collapse=","),". Please remove or rename the duplicate(s) and try again.") + + xnames = by.x; xintervals = tail(xnames, 2L); + ynames = by.y; yintervals = tail(ynames, 2L); + if (!storage.mode(x[[xintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(x[[xintervals[2L]]]) %chin% c("double", "integer")) + stop("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table 'x' and must be integer/numeric type.") + if ( any(x[[xintervals[2L]]] - x[[xintervals[1L]]] < 0L) ) + stop("All entries in column ", xintervals[1L], " should be <= corresponding entries in column ", xintervals[2L], " in data.table 'x'") + if (!storage.mode(y[[yintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(y[[yintervals[2L]]]) %chin% c("double", "integer")) + stop("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table 'y' and must be integer/numeric type.") + if ( any(y[[yintervals[2L]]] - y[[yintervals[1L]]] < 0L) ) + stop("All entries in column ", yintervals[1L], " should be <= corresponding entries in column ", yintervals[2L], " in data.table 'y'") + ## see NOTES below: + yclass = c(class(y[[yintervals[1L]]]), class(y[[yintervals[2L]]])) + isdouble = FALSE; isposix = FALSE + if ( any(c("numeric", "POSIXct") %chin% yclass) ) { + # next representive double > x under the given precision (48,56 or 64-bit in data.table) = x*incr + dt_eps <- function() { + bits = floor(log2(.Machine$double.eps)) + 2 ^ (bits + (getNumericRounding() * 8L)) } - construct <- function(icols, mcols, type=type) { - icall = make_call(icols) - setattr(icall, 'names', icols) - mcall = make_call(mcols, quote(c)) - if (type %chin% c("within", "any")) { - mcall[[3L]] = substitute( - if (isposix) unclass(val)*incr # incr is okay since this won't be negative - else if (isdouble) { - # fix for #1006 - 0.0 occurs in both start and end - # better fix for 0.0, and other -ves. can't use 'incr' - # hopefully this doesn't open another can of worms - (val+dt_eps())*(1 + sign(val)*dt_eps()) - } - else val+incr, - list(val = mcall[[3L]], incr = incr)) + incr = 1 + dt_eps() + isdouble = TRUE + isposix = "POSIXct" %chin% yclass + } else incr = 1L # integer or Date class for example + + ## hopefully all checks are over. Now onto the actual task at hand. + origx = x; x = shallow(x, by.x) + origy = y; y = shallow(y, by.y) + if (identical(by.x, key(origx)[seq_along(by.x)])) + setattr(x, 'sorted', by.x) + setattr(y, 'sorted', by.y) ## is definitely sorted on by.y + roll = switch(type, start=, end=, equal= 0.0, + any=, within= +Inf) + make_call <- function(names, fun=NULL) { + if (is.character(names)) + names = lapply(names, as.name) + call = c(substitute(fun, list(fun=fun)), names) + if (!is.null(fun)) as.call(call) else call + } + construct <- function(icols, mcols, type=type) { + icall = make_call(icols) + setattr(icall, 'names', icols) + mcall = make_call(mcols, quote(c)) + if (type %chin% c("within", "any")) { + mcall[[3L]] = substitute( + if (isposix) unclass(val)*incr # incr is okay since this won't be negative + else if (isdouble) { + # fix for #1006 - 0.0 occurs in both start and end + # better fix for 0.0, and other -ves. can't use 'incr' + # hopefully this doesn't open another can of worms + (val+dt_eps())*(1 + sign(val)*dt_eps()) } - make_call(c(icall, pos=mcall), quote(list)) + else val+incr, + list(val = mcall[[3L]], incr = incr)) } - uycols = switch(type, start = yintervals[1L], - end = yintervals[2L], any =, - within =, equal = yintervals) - call = construct(head(ynames, -2L), uycols, type) - if (verbose) {last.started.at=proc.time()[3];cat("unique() + setkey() operations done in ...");flush.console()} - uy = unique(y[, eval(call)]) - setkey(uy)[, `:=`(lookup = list(list(integer(0))), type_lookup = list(list(integer(0))), count=0L, type_count=0L)] - if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console} - matches <- function(ii, xx, del, ...) { - cols = setdiff(names(xx), del) - xx = shallow(xx, cols) - ans = bmerge(xx, ii, seq_along(xx), seq_along(xx), haskey(xx), integer(0), mult=mult, ops=rep(1L, length(xx)), integer(0), 1L, verbose=verbose, ...) - # vecseq part should never run here, but still... - if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL) - } - indices <- function(x, y, intervals, ...) { - if (type == "start") { - sidx = eidx = matches(x, y, intervals[2L], rollends=c(FALSE,FALSE), ...) ## TODO: eidx can be set to integer(0) - } else if (type == "end") { - eidx = sidx = matches(x, y, intervals[1L], rollends=c(FALSE,FALSE), ...) ## TODO: sidx can be set to integer(0) - } else { - sidx = matches(x, y, intervals[2L], rollends=rep(type == "any", 2L), ...) - eidx = matches(x, y, intervals[1L], rollends=c(FALSE,TRUE), ...) - } - list(sidx, eidx) - } - # nomatch has no effect here, just for passing arguments consistently to `bmerge` - .Call(Clookup, uy, nrow(y), indices(uy, y, yintervals, nomatch=0L, roll=roll), maxgap, minoverlap, mult, type, verbose) - if (maxgap == 0L && minoverlap == 1L) { - iintervals = tail(names(x), 2L) - if (verbose) {last.started.at=proc.time()[3];cat("binary search(es) done in ...");flush.console()} - xmatches = indices(uy, x, xintervals, nomatch=0L, roll=roll) - if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console} - olaps = .Call(Coverlaps, uy, xmatches, mult, type, nomatch, verbose) - } else if (maxgap == 0L && minoverlap > 1L) { - stop("Not yet implemented") - } else if (maxgap > 0L && minoverlap == 1L) { - stop("Not yet implemented") - } else if (maxgap > 0L && minoverlap > 1L) { - if (maxgap > minoverlap) - warning("maxgap > minoverlap. maxgap will have no effect here.") - stop("Not yet implemented") - } - setDT(olaps) - setnames(olaps, c("xid", "yid")) - yid = NULL # for 'no visible binding for global variable' from R CMD check on i clauses below - # if (type == "any") setorder(olaps) # at times the combine operation may not result in sorted order - # CsubsetDT bug has been fixed by Matt. So back to using it! Should improve subset substantially. - if (which) { - if (mult %chin% c("first", "last")) - return (olaps$yid) - else if (!is.na(nomatch)) - return (.Call(CsubsetDT, olaps, which(olaps$yid > 0L), seq_along(olaps))) - else return (olaps) + make_call(c(icall, pos=mcall), quote(list)) + } + uycols = switch(type, start = yintervals[1L], + end = yintervals[2L], any =, + within =, equal = yintervals) + call = construct(head(ynames, -2L), uycols, type) + if (verbose) {last.started.at=proc.time()[3];cat("unique() + setkey() operations done in ...");flush.console()} + uy = unique(y[, eval(call)]) + setkey(uy)[, `:=`(lookup = list(list(integer(0))), type_lookup = list(list(integer(0))), count=0L, type_count=0L)] + if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console} + matches <- function(ii, xx, del, ...) { + cols = setdiff(names(xx), del) + xx = shallow(xx, cols) + ans = bmerge(xx, ii, seq_along(xx), seq_along(xx), haskey(xx), integer(0), mult=mult, ops=rep(1L, length(xx)), integer(0), 1L, verbose=verbose, ...) + # vecseq part should never run here, but still... + if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL) + } + indices <- function(x, y, intervals, ...) { + if (type == "start") { + sidx = eidx = matches(x, y, intervals[2L], rollends=c(FALSE,FALSE), ...) ## TODO: eidx can be set to integer(0) + } else if (type == "end") { + eidx = sidx = matches(x, y, intervals[1L], rollends=c(FALSE,FALSE), ...) ## TODO: sidx can be set to integer(0) } else { - if (!is.na(nomatch)) - olaps = .Call(CsubsetDT, olaps, which(olaps$yid > 0L), seq_along(olaps)) - ycols = setdiff(names(origy), head(by.y, -2L)) - idx = chmatch(ycols, names(origx), nomatch=0L) - ans = .Call(CsubsetDT, origx, olaps$xid, seq_along(origx)) - if (any(idx>0L)) - setnames(ans, names(ans)[idx], paste("i.", names(ans)[idx], sep="")) - xcols1 = head(by.x, -2L) - xcols2 = setdiff(names(ans), xcols1) - ans[, (ycols) := .Call(CsubsetDT, origy, olaps$yid, chmatch(ycols, names(origy)))] - setcolorder(ans, c(xcols1, ycols, xcols2)) - return (ans[]) + sidx = matches(x, y, intervals[2L], rollends=rep(type == "any", 2L), ...) + eidx = matches(x, y, intervals[1L], rollends=c(FALSE,TRUE), ...) } + list(sidx, eidx) + } + # nomatch has no effect here, just for passing arguments consistently to `bmerge` + .Call(Clookup, uy, nrow(y), indices(uy, y, yintervals, nomatch=0L, roll=roll), maxgap, minoverlap, mult, type, verbose) + if (maxgap == 0L && minoverlap == 1L) { + iintervals = tail(names(x), 2L) + if (verbose) {last.started.at=proc.time()[3];cat("binary search(es) done in ...");flush.console()} + xmatches = indices(uy, x, xintervals, nomatch=0L, roll=roll) + if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console} + olaps = .Call(Coverlaps, uy, xmatches, mult, type, nomatch, verbose) + } else if (maxgap == 0L && minoverlap > 1L) { + stop("Not yet implemented") + } else if (maxgap > 0L && minoverlap == 1L) { + stop("Not yet implemented") + } else if (maxgap > 0L && minoverlap > 1L) { + if (maxgap > minoverlap) + warning("maxgap > minoverlap. maxgap will have no effect here.") + stop("Not yet implemented") + } + setDT(olaps) + setnames(olaps, c("xid", "yid")) + yid = NULL # for 'no visible binding for global variable' from R CMD check on i clauses below + # if (type == "any") setorder(olaps) # at times the combine operation may not result in sorted order + # CsubsetDT bug has been fixed by Matt. So back to using it! Should improve subset substantially. + if (which) { + if (mult %chin% c("first", "last")) + return (olaps$yid) + else if (!is.na(nomatch)) + return (.Call(CsubsetDT, olaps, which(olaps$yid > 0L), seq_along(olaps))) + else return (olaps) + } else { + if (!is.na(nomatch)) + olaps = .Call(CsubsetDT, olaps, which(olaps$yid > 0L), seq_along(olaps)) + ycols = setdiff(names(origy), head(by.y, -2L)) + idx = chmatch(ycols, names(origx), nomatch=0L) + ans = .Call(CsubsetDT, origx, olaps$xid, seq_along(origx)) + if (any(idx>0L)) + setnames(ans, names(ans)[idx], paste("i.", names(ans)[idx], sep="")) + xcols1 = head(by.x, -2L) + xcols2 = setdiff(names(ans), xcols1) + ans[, (ycols) := .Call(CsubsetDT, origy, olaps$yid, chmatch(ycols, names(origy)))] + setcolorder(ans, c(xcols1, ycols, xcols2)) + return (ans[]) + } } ## Notes: (If there's a better way than the solution I propose here, I'd be glad to apply it.) @@ -187,10 +184,10 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by. # --- # Firstly, assuming 64-bit precision, we can't simply add 1L. For e.g., consider: - # x = data.table(start=0.88, end=0.88) - # y = data.table(start=0.26, end=0.61, key=c("start", "end")) +# x = data.table(start=0.88, end=0.88) +# y = data.table(start=0.26, end=0.61, key=c("start", "end")) # and we'd like to do: - # foverlaps(x, y, type="any") +# foverlaps(x, y, type="any") # Adding 1 to 0.61 will result in 1.61, and will make sure 0.88 falls between 0.26 and 1.61, and that's wrong! # POSIXct objects are internally numeric as well. @@ -203,19 +200,19 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by. # Simple again. It fixes the problem, for now (read on). # NOTE THAT simply doing ("end" + .Machine$double.eps ^ 0.5) is insufficient because this doesn't work as the numbers grow bigger. For e.g., try: - # identical(3e8, 3e8+.Machine$double.eps^0.5) - # identical(3e8, 3e8*(1+.Machine$double.eps)) +# identical(3e8, 3e8+.Machine$double.eps^0.5) +# identical(3e8, 3e8*(1+.Machine$double.eps)) # The important point is that we **need to be ABSOLUTELY sure** that "end coordinate" gets incremented to it's *next representative number*. Why? Consider a 'subject' interval [4,4]. When we collapse this to get the 1D-form and take `unique`, if the "end" coordinate is not distinguishable from the start coordinate, `unique` will return just one value. And this brings us back to square-one (the reason why we needed to add one in the first place!!). For example, consider query = c(5,5) and subject = c(4,4). Now, after collapsing and taking unique, if we end up with just one 4, then performing the join later will result in [5,5] actually matching 4 whose lookup will have valid indices and not NULL resulting in an incorrect overlap. We absolutely need the second 4, and we want it to be greater than the start 4, but just the smallest separation between them possible (so that we don't miss any other numbers that fall in that range). # For POSIXct objects, we could still do the same. But a) multiplication is not supported for POSIXt objects, so we'll have to unclass it, multiply and convert back.. which is not ideal - timezone, time spent on conversion etc.. and b) all.equal in base considers a tolerance of 0.001 for POSIXt objects, I'm guessing this is for "millisecond" resolution? The problem with (b) is that more than millisecond resolution will return incorrect results again. - # # More than millisecond resolution. Results are not stable. - # tt = c( as.POSIXct('2011-10-11 07:49:36.0003'), as.POSIXct('2011-10-11 07:49:36.0199'), as.POSIXct('2011-10-11 07:49:36.0399')) - # DT1 = data.table(start=tt, end=tt) - # DT2 = data.table(start=tt[2], end=tt[2]) - # setkey(DT2) - # foverlaps(DT1, DT2, which=TRUE) +# # More than millisecond resolution. Results are not stable. +# tt = c( as.POSIXct('2011-10-11 07:49:36.0003'), as.POSIXct('2011-10-11 07:49:36.0199'), as.POSIXct('2011-10-11 07:49:36.0399')) +# DT1 = data.table(start=tt, end=tt) +# DT2 = data.table(start=tt[2], end=tt[2]) +# setkey(DT2) +# foverlaps(DT1, DT2, which=TRUE) # So, to put an end to this problem, we'll unclass it, multiply and convert back. In any case, the join does not depend on the timezone, as the internal numeric equivalent seems to be identical irrespective of the time zones.. So that's good news! # --- From 4d30c794736d92407b1e120e5c537afa0a4afac8 Mon Sep 17 00:00:00 2001 From: "NORDEXAG\\BonschM" Date: Sat, 9 Sep 2017 06:12:30 +0200 Subject: [PATCH 4/6] Better unit tests for .shallow. --- inst/tests/tests.Rraw | 53 ++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 75da370ac4..08e5f1e359 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6855,28 +6855,45 @@ test(1545.7, key(.shallow(x1, cols=c("a1", "a2", "a3"), retain.key=TRUE)), c("a test(1545.8, key(.shallow(x1, cols=c("a2", "a3"), retain.key=TRUE)), NULL) test(1545.9, key(.shallow(x1, cols=c("a2"), retain.key=TRUE)), NULL) test(1545.10, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL) +setkey(x1, NULL) +test(1545.11, key(.shallow(x1, retain.key=TRUE)), NULL) +test(1545.12, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL) +x1 <- x1[0] +test(1545.13, key(.shallow(x1, retain.key=TRUE)), NULL) +test(1545.131, key(.shallow(x1, retain.key=FALSE)), NULL) +test(1545.132, key(.shallow(x1, cols = c("a1"), retain.key=FALSE)), NULL) +test(1545.133, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL) +setkey(x1, a1) +test(1545.134, key(.shallow(x1, retain.key=FALSE)), NULL) +test(1545.135, key(.shallow(x1, cols = "a2", retain.key=FALSE)), NULL) +test(1545.136, key(.shallow(x1, retain.key=TRUE)), "a1") +test(1545.137, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), "a1") +test(1545.138, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL) +setkey(x1, NULL) + +test(1545.136, key(.shallow(x1, retain.key=FALSE)), NULL) # tests for #2336. .shallow now retains indices as well x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), a2 = c(2L, 2L, 1L, 2L, 3L, 2L), a3 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE), key="a1,a2") setindex(x1, a1, a2, a3) setindex(x1, a1, a3) -test(1545.11, indices(.shallow(x1, retain.key=FALSE)), NULL) -test(1545.12, indices(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) -test(1545.13, indices(.shallow(x1, retain.key=TRUE)), indices(x1)) -test(1545.14, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a2__a3")], c("a1", "a2", "a3")), integer(0)) -test(1545.15, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) -test(1545.16, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) -test(1545.17, indices(.shallow(x1, cols = "a1", retain.key=TRUE)), c("a1")) -test(1545.18, forderv(.shallow(x1, cols = "a1", retain.key=TRUE)[attr(attr(.shallow(x1, cols = "a1", retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) -test(1545.19, attributes(attr(.shallow(x1, cols = c("a1", "a2"), retain.key = TRUE), "index", exact = TRUE)), attributes(attr(.shallow(x1, cols = c("a2", "a1"), retain.key = TRUE), "index", exact = TRUE))) -test(1545.20, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2", "a1")) -test(1545.21, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) -test(1545.22, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1__a2")], c("a1", "a2")), integer(0)) -test(1545.23, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1", "a1__a3")) -test(1545.24, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) -test(1545.25, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) -test(1545.26, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL) -test(1545.27, indices(.shallow(x1, cols = c("a3"), retain.key=TRUE)), NULL) -test(1545.28, .shallow(x1, cols = c("a1", "a2", "a3"), retain.key=TRUE), .shallow(x1, retain.key=TRUE)) +test(1545.15, indices(.shallow(x1, retain.key=FALSE)), NULL) +test(1545.16, indices(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) +test(1545.17, indices(.shallow(x1, retain.key=TRUE)), indices(x1)) +test(1545.18, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a2__a3")], c("a1", "a2", "a3")), integer(0)) +test(1545.19, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) +test(1545.20, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.21, indices(.shallow(x1, cols = "a1", retain.key=TRUE)), c("a1")) +test(1545.22, forderv(.shallow(x1, cols = "a1", retain.key=TRUE)[attr(attr(.shallow(x1, cols = "a1", retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.23, attributes(attr(.shallow(x1, cols = c("a1", "a2"), retain.key = TRUE), "index", exact = TRUE)), attributes(attr(.shallow(x1, cols = c("a2", "a1"), retain.key = TRUE), "index", exact = TRUE))) +test(1545.24, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2", "a1")) +test(1545.25, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.26, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1__a2")], c("a1", "a2")), integer(0)) +test(1545.27, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1", "a1__a3")) +test(1545.28, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.29, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) +test(1545.30, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL) +test(1545.31, indices(.shallow(x1, cols = c("a3"), retain.key=TRUE)), NULL) +test(1545.32, .shallow(x1, cols = c("a1", "a2", "a3"), retain.key=TRUE), .shallow(x1, retain.key=TRUE)) # test for #1234 From 6e8eb7f61edf46430cd8e250ffb866acb0ef6ab5 Mon Sep 17 00:00:00 2001 From: "NORDEXAG\\BonschM" Date: Sat, 9 Sep 2017 07:50:55 +0200 Subject: [PATCH 5/6] Finished implementatio: bugfix in foverlaps.R and newest tests. --- R/foverlaps.R | 5 +---- inst/tests/tests.Rraw | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/R/foverlaps.R b/R/foverlaps.R index 0a7ad9c9ab..f8f148ae75 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -77,9 +77,6 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by. ## hopefully all checks are over. Now onto the actual task at hand. origx = x; x = shallow(x, by.x) origy = y; y = shallow(y, by.y) - if (identical(by.x, key(origx)[seq_along(by.x)])) - setattr(x, 'sorted', by.x) - setattr(y, 'sorted', by.y) ## is definitely sorted on by.y roll = switch(type, start=, end=, equal= 0.0, any=, within= +Inf) make_call <- function(names, fun=NULL) { @@ -116,7 +113,7 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by. if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console} matches <- function(ii, xx, del, ...) { cols = setdiff(names(xx), del) - xx = shallow(xx, cols) + xx = .shallow(xx, cols, retain.key = FALSE) ans = bmerge(xx, ii, seq_along(xx), seq_along(xx), haskey(xx), integer(0), mult=mult, ops=rep(1L, length(xx)), integer(0), 1L, verbose=verbose, ...) # vecseq part should never run here, but still... if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 08e5f1e359..8001a2b56b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6846,6 +6846,7 @@ test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.fram x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2") test(1545.1, key(.shallow(x1, cols="a2")), NULL) test(1545.2, key(.shallow(x1, retain.key=FALSE)), NULL) +test(1545.2, key(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) test(1545.3, key(.shallow(x1, retain.key=TRUE)), key(x1)) test(1545.4, key(.shallow(x1, cols="a1", retain.key=TRUE)), "a1") # tests for #2336. .shallow drops keys unnecessarily @@ -6857,7 +6858,9 @@ test(1545.9, key(.shallow(x1, cols=c("a2"), retain.key=TRUE)), NULL) test(1545.10, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL) setkey(x1, NULL) test(1545.11, key(.shallow(x1, retain.key=TRUE)), NULL) +test(1545.111, key(.shallow(x1, retain.key=FALSE)), NULL) test(1545.12, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL) +test(1545.121, key(.shallow(x1, cols=c("a1", "a2"), retain.key=FALSE)), NULL) x1 <- x1[0] test(1545.13, key(.shallow(x1, retain.key=TRUE)), NULL) test(1545.131, key(.shallow(x1, retain.key=FALSE)), NULL) @@ -6869,9 +6872,7 @@ test(1545.135, key(.shallow(x1, cols = "a2", retain.key=FALSE)), NULL) test(1545.136, key(.shallow(x1, retain.key=TRUE)), "a1") test(1545.137, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), "a1") test(1545.138, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL) -setkey(x1, NULL) -test(1545.136, key(.shallow(x1, retain.key=FALSE)), NULL) # tests for #2336. .shallow now retains indices as well x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), a2 = c(2L, 2L, 1L, 2L, 3L, 2L), a3 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE), key="a1,a2") setindex(x1, a1, a2, a3) From 52622cc753d2634ebd94e4a88eb98b341eea250e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 11 Sep 2017 12:47:35 -0700 Subject: [PATCH 6/6] News item refined --- NEWS.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 900e580aa2..2faadd17b4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -83,9 +83,7 @@ 18. Integer values that are too large to fit in `int64` will now be read as strings [#2250](https://github.com/Rdatatable/data.table/issues/2250). - -18. .shallow retains keys correctly [#2336](https://github.com/Rdatatable/data.table/issues/2336) - Thanks to @MarkusBonsch for reporting and fixing +19. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323). #### NOTES