Rdatatable · mattdowle · Sep 11, 2017 · Sep 7, 2017 · Sep 8, 2017 · Sep 8, 2017
@@ -83,7 +83,8 @@
 
 18. Integer values that are too large to fit in `int64` will now be read as strings [#2250](https://github.com/Rdatatable/data.table/issues/2250).
 
-
+19. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This is a significant step towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323).
+
 #### NOTES
 
 1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing.

@@ -2355,14 +2355,41 @@ point <- function(to, to_idx, from, from_idx) {
     isnull = is.null(cols)
     if (!isnull) cols = validate(cols, x)  # NULL is default = all columns
     ans = .Call(Cshallowwrapper, x, cols)  # copies VECSXP only
-    if (retain.key && isnull) return(ans)  # handle most frequent case first
-    # rest of the cases
-    cols = names(x)[cols]
-    retain.key = retain.key && identical(cols, head(key(x), length(cols)))
-    setattr(ans, 'sorted', if (haskey(x) && retain.key) cols else NULL)
+
+    if(retain.key){
+      if(isnull) return(ans) # handle most frequent case first
+      ## get correct key if cols are present
+      cols = names(x)[cols]
+      keylength <- which.first(!key(ans) %chin% cols) - 1L
+      if(is.na(keylength)) keylength <- length(key(ans))
+      if(!keylength){
+        setattr(ans, "sorted", NULL) ## no key remaining
+      } else {
+        setattr(ans, "sorted", head(key(ans), keylength)) ## keep what can be kept
+      }
+      ## take care of attributes.
+      indices <- names(attributes(attr(ans, "index")))
+      for(index in indices){
+        indexcols <- strsplit(index, split = "__")[[1]][-1L]
+        indexlength <- which.first(!indexcols %chin% cols) - 1L
+        if(is.na(indexlength)) next ## all columns are present, nothing to be done
+        reducedindex <- paste0(c("", indexcols[seq_len(indexlength)]), collapse = "__") ## the columns until the first missing form the new index
+        if(reducedindex %chin% indices || !indexlength){
+          ## Either reduced index already present or no columns of the original index remain.
+          ## Drop the original index completely
+          setattr(attr(ans, "index", exact = TRUE), index, NULL)
+        } else {
+          ## rename index to reducedindex
+          names(attributes(attr(ans, "index")))[names(attributes(attr(ans, "index"))) == index] <- reducedindex
+        }
+      }
+    } else { # retain.key == FALSE
+      setattr(ans, "sorted", NULL)
+      setattr(ans, "index", NULL)
+    }
     if (unlock) setattr(ans, '.data.table.locked', NULL)
     ans
-    # TODO: check/remove attributes for secondary keys?
+
 }
 
 shallow <- function(x, cols=NULL) {

@@ -75,9 +75,6 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.
   ## hopefully all checks are over. Now onto the actual task at hand.
   origx = x; x = shallow(x, by.x)
   origy = y; y = shallow(y, by.y)
-  if (identical(by.x, key(origx)[seq_along(by.x)]))
-    setattr(x, 'sorted', by.x)
-  setattr(y, 'sorted', by.y) ## is definitely sorted on by.y
   roll = switch(type, start=, end=, equal= 0.0, any=, within= +Inf)
   make_call <- function(names, fun=NULL) {
     if (is.character(names))
@@ -113,7 +110,7 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.
   if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console}
   matches <- function(ii, xx, del, ...) {
     cols = setdiff(names(xx), del)
-    xx = shallow(xx, cols)
+    xx = .shallow(xx, cols, retain.key = FALSE)
     ans = bmerge(xx, ii, seq_along(xx), seq_along(xx), haskey(xx), integer(0), mult=mult, ops=rep(1L, length(xx)), integer(0), 1L, verbose=verbose, ...)
     # vecseq part should never run here, but still...
     if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)

@@ -6843,11 +6843,59 @@ setkey(x1, a1, a2)
 test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y)))
 test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2"))
 # also test shallow here so as to catch future regressions
-x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), key="a1,a2")
+x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2")
 test(1545.01, key(.shallow(x1, cols="a2")), NULL)  # ids zero-padded: test ids are numeric, so 1545.1 would equal 1545.10
 test(1545.02, key(.shallow(x1, retain.key=FALSE)), NULL)  # retain.key=FALSE: no key expected on the copy
+test(1545.021, key(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL)  # own sub-id; previously reused 1545.2
 test(1545.03, key(.shallow(x1, retain.key=TRUE)), key(x1))  # all columns kept: full key expected
 test(1545.04, key(.shallow(x1, cols="a1", retain.key=TRUE)), "a1")  # only the leading key column is selected
+# tests for #2336. .shallow drops keys unnecessarily
+test(1545.05, key(.shallow(x1, cols=c("a1", "a3"), retain.key=TRUE)), "a1")  # key is cut at the first missing key column (a2)
+test(1545.06, .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE), .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE))  # NOTE(review): both sides are the same call; confirm the intended expected value
+test(1545.07, key(.shallow(x1, cols=c("a1", "a2",  "a3"), retain.key=TRUE)), c("a1", "a2"))  # every key column selected: key unchanged
+test(1545.08, key(.shallow(x1, cols=c("a2",  "a3"), retain.key=TRUE)), NULL)  # leading key column a1 missing: no key
+test(1545.09, key(.shallow(x1, cols=c("a2"), retain.key=TRUE)), NULL)
+test(1545.10, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL)  # no key column selected at all
+setkey(x1, NULL)  # remove the key; every variant below is expected to report no key
+test(1545.11, key(.shallow(x1, retain.key=TRUE)), NULL)
+test(1545.111, key(.shallow(x1, retain.key=FALSE)), NULL)
+test(1545.12, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL)
+test(1545.121, key(.shallow(x1, cols=c("a1", "a2"), retain.key=FALSE)), NULL)
+x1 <- x1[0]  # zero-row subset of the (still unkeyed) table
+test(1545.13,  key(.shallow(x1, retain.key=TRUE)), NULL)
+test(1545.131, key(.shallow(x1, retain.key=FALSE)), NULL)
+test(1545.132, key(.shallow(x1, cols = c("a1"), retain.key=FALSE)), NULL)
+test(1545.133, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL)
+setkey(x1, a1)  # key the zero-row table on a1 only
+test(1545.134, key(.shallow(x1, retain.key=FALSE)), NULL)  # retain.key=FALSE: key expected to be dropped
+test(1545.135, key(.shallow(x1, cols = "a2", retain.key=FALSE)), NULL)
+test(1545.136, key(.shallow(x1, retain.key=TRUE)), "a1")  # all columns kept: key a1 expected
+test(1545.137, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), "a1")  # key column a1 is among the selected columns
+test(1545.138, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL)  # key column a1 not selected: no key expected
+
+# tests for #2336. .shallow now retains indices as well
+x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), a2 = c(2L, 2L, 1L, 2L, 3L, 2L), a3 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE), key="a1,a2")  # fixture keyed on (a1, a2)
+setindex(x1, a1, a2, a3)  # first index: (a1, a2, a3)
+setindex(x1, a1, a3)  # second index: (a1, a3)
+test(1545.15, indices(.shallow(x1, retain.key=FALSE)), NULL)  # retain.key=FALSE: no indices expected on the copy
+test(1545.16, indices(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL)
+test(1545.17, indices(.shallow(x1, retain.key=TRUE)), indices(x1))  # all columns kept: same indices as x1
+test(1545.18, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a2__a3")], c("a1", "a2", "a3")), integer(0))  # rows taken in stored index order; expected forderv result is integer(0)
+test(1545.19, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0))
+test(1545.20, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))  # NOTE(review): no index named exactly "__a1" was set above and attr() is called without exact=TRUE; confirm what this lookup returns
+test(1545.21, indices(.shallow(x1, cols = "a1", retain.key=TRUE)), c("a1"))  # only a1 selected: a single index on a1 expected
+test(1545.22, forderv(.shallow(x1, cols = "a1", retain.key=TRUE)[attr(attr(.shallow(x1, cols = "a1", retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))
+test(1545.23, attributes(attr(.shallow(x1, cols = c("a1", "a2"), retain.key = TRUE), "index", exact = TRUE)), attributes(attr(.shallow(x1, cols = c("a2", "a1"), retain.key = TRUE), "index", exact = TRUE)))  # order of cols is expected not to change the index attributes
+test(1545.24, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2", "a1"))  # a3 not selected: each index is expected to be cut at a3
+test(1545.25, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))
+test(1545.26, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1__a2")], c("a1", "a2")), integer(0))
+test(1545.27, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1", "a1__a3"))  # a2 not selected: (a1, a2, a3) expected to shrink to a1, (a1, a3) kept
+test(1545.28, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))
+test(1545.29, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0))
+test(1545.30, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL)  # leading index column a1 not selected: no indices expected
+test(1545.31, indices(.shallow(x1, cols = c("a3"), retain.key=TRUE)), NULL)
+test(1545.32, .shallow(x1, cols = c("a1", "a2", "a3"), retain.key=TRUE), .shallow(x1, retain.key=TRUE))  # naming every column is expected to equal the cols=NULL default
+
 
 # test for #1234
 df1 = df2 = data.frame(cats = rep(c('', ' ',  'meow'), 5))