Anirban166 · Anirban166 · Apr 6, 2024 · Jun 26, 2020 · Mar 19, 2024 · Mar 19, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -169,6 +169,9 @@ unit = "s")
 
 10. Starting from 4.0.0, data.table is using R's `rbind` and `cbind` methods, as described in v1.12.6 news entry. Support for R 3.x.x is resolved when processing `NAMESPACE` file, at install time, or at the time of building package binaries. As a result, users on R 3.x.x, if installing from binaries, must use binaries built by R 3.x.x, and users on R 4.x.x, if installing from binaries, must use binaries built by R 4.x.x. Users will see `package ‘data.table’ was built under R version...` warning when this happen. Thanks to @vinhdizzo for reporting in [#4528](https://github.com/Rdatatable/data.table/issues/4528).
 
+11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR.
+
+
 # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1)  (09 Dec 2019)
 
 ## NEW FEATURES

diff --git a/inst/atime/tests.R b/inst/atime/tests.R
@@ -0,0 +1,84 @@
+pkg.edit.fun = quote(function(old.Package, new.Package, sha, new.pkg.path) {
+      pkg_find_replace <- function(glob, FIND, REPLACE) {
+        atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
+      }
+      Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
+      Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
+      new.Package_ <- paste0(Package_, "_", sha)
+      pkg_find_replace(
+        "DESCRIPTION", 
+        paste0("Package:\\s+", old.Package),
+        paste("Package:", new.Package))
+      pkg_find_replace(
+        file.path("src", "Makevars.*in"),
+        Package_regex,
+        new.Package_)
+      pkg_find_replace(
+        file.path("R", "onLoad.R"),
+        Package_regex,
+        new.Package_)
+      pkg_find_replace(
+        file.path("R", "onLoad.R"),
+        sprintf('packageVersion\\("%s"\\)', old.Package),
+        sprintf('packageVersion\\("%s"\\)', new.Package))
+      pkg_find_replace(
+        file.path("src", "init.c"),
+        paste0("R_init_", Package_regex),
+        paste0("R_init_", gsub("[.]", "_", new.Package_)))
+      pkg_find_replace(
+        "NAMESPACE",
+        sprintf('useDynLib\\("?%s"?', Package_regex),
+        paste0('useDynLib(', new.Package_))
+    })
+
+test.list <- list(
+  # Performance regression fixed in: https://github.com/Rdatatable/data.table/pull/4440
+  "Test regression fixed in #4440" = list(
+    pkg.edit.fun = pkg.edit.fun,
+    N = 10^seq(3,8),
+    setup = quote({
+      set.seed(1L)
+      dt <- data.table(a = sample(N, N))
+      setindex(dt, a)
+    }),
+    expr = quote(data.table:::shallow(dt)),
+    "Before"="9d3b9202fddb980345025a4f6ac451ed26a423be",
+    "Regression"="752012f577f8e268bb6d0084ca39a09fa7fbc1c4", 
+    "Fixed"="9d3b9202fddb980345025a4f6ac451ed26a423be"),
+
+  # Test based on https://github.com/Rdatatable/data.table/issues/5424
+  # Performance regression introduced in: https://github.com/Rdatatable/data.table/pull/4491
+  "Test regression introduced from #4491" = list(
+    pkg.edit.fun = pkg.edit.fun,
+    N = 10^seq(3, 8),
+    expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)),
+    setup = quote({
+      n <- N/100
+      set.seed(1L)
+      dt <- data.table(
+        g = sample(seq_len(n), N, TRUE),
+        x = runif(N),
+        key = "g")
+      dt_mod <- copy(dt)
+    }),
+    "Before"="be2f72e6f5c90622fe72e1c315ca05769a9dc854",
+    "Regression"="e793f53466d99f86e70fc2611b708ae8c601a451",
+    "Fixed"="58409197426ced4714af842650b0cc3b9e2cb842")
+
+  # Test based on https://github.com/Rdatatable/data.table/issues/4200
+  # Performance regression fixed in: https://github.com/Rdatatable/data.table/pull/4558
+  #"Test regression fixed in #4558" = list(
+  #pkg.edit.fun = pkg.edit.fun,
+  #N = 10^seq(1, 20),
+  #expr = quote(data.table:::`[.data.table`(d, , (max(v1) - min(v2)), by = id3)),
+  #setup = quote({ 
+  #  set.seed(108)
+  #  d <- data.table(
+  #    id3 = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
+  #    v1 = sample(5L, N, TRUE),
+  #    v2 = sample(5L, N, TRUE))
+  #  }),
+  #  "Before" = "15f0598b9828d3af2eb8ddc9b38e0356f42afe4f",
+  #  "Regression" = "6f360be0b2a6cf425f6df751ca9a99ec5d35ed93",
+  #  "Fixed" = "ba32f3cba38ec270587e395f6e6c26a80be36be6")       
+)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -12871,30 +12871,32 @@ ids <- sample(letters[1:3], 10, replace=TRUE)
 scores <- rnorm(10)
 dt <- data.table(id=ids, score=scores)
 dt.s4 <- new("Data.Table", data.table(id=ids, score=scores))
-test(1914.1, isS4(dt.s4))
-test(1914.2, inherits(dt.s4, 'data.table'))
+test(1914.01, isS4(dt.s4))
+test(1914.02, inherits(dt.s4, 'data.table'))
+# Test possible regression. shallow() needs to preserve the S4 bit to support S4 classes that contain data.table
+test(1914.03, isS4(shallow(dt.s4)))
 ## pull out data from S4 as.list, and compare to list from dt
 dt.s4.list <- dt.s4@.Data
 names(dt.s4.list) <- names(dt.s4)
-test(1914.3, dt.s4.list, as.list(dt))  # Underlying data not identical
+test(1914.04, dt.s4.list, as.list(dt))  # Underlying data not identical
 # simple S4 conversion-isms work
 df = data.frame(a=sample(letters, 10), b=1:10)
 dt = as.data.table(df)
-test(1914.4, identical(as(df, 'data.table'), dt))
-test(1914.5, identical(as(dt, 'data.frame'), df))
+test(1914.05, identical(as(df, 'data.table'), dt))
+test(1914.06, identical(as(dt, 'data.frame'), df))
 # data.table can be used in an S4 slot
 dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10))
 dt.comp <- new("S4Composition", data=dt)
-test(1914.6, dt.comp@data, dt)
+test(1914.07, dt.comp@data, dt)
 # S4 methods dispatch properly on data.table slots"
 dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10))
 dt.comp <- new("S4Composition", data=dt)
 setGeneric("dtGet", function(x, what) standardGeneric("dtGet"))
 setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data})
 setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]})
-test(1914.7, dtGet(dt.comp), dt)  # actually
-test(1914.8, identical(dtGet(dt.comp, 1), dt[[1]]))
-test(1914.9, identical(dtGet(dt.comp, 'b'), dt$b))
+test(1914.08, dtGet(dt.comp), dt)  # actually
+test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]]))
+test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b))
 removeClass("Data.Table")   # so that test 1914.2 passes on the second run of cc() in dev
 removeClass("S4Composition")
 # END port of old testthat tests
@@ -16999,3 +17001,16 @@ B = data.table(x=1:2)
 X = A == B
 A[, y := 3:4]
 test(2148, colnames(X), c('x'))
+
+# shallow() shouldn't take a deep copy of indices, #4311
+dt <- data.table(a = c(3, 1))
+setindex(dt, a)
+dt2 <- shallow(dt)
+test(2149.1, address(attr(attr(dt, 'index'), '__a')), address(attr(attr(dt2, 'index'), '__a')))
+# Testing possible future regression. shallow() needs to copy the names of indices and keys.
+setnames(dt2, 'a', 'A')
+test(2149.2, indices(dt), 'a')
+setkey(dt, a)
+dt2 <- shallow(dt)
+setnames(dt2, 'a', 'A')
+test(2149.3, key(dt), 'a')
diff --git a/src/assign.c b/src/assign.c
@@ -152,13 +152,25 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
   R_len_t i,l;
   int protecti=0;
   SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++;   // to do, use growVector here?
-  //copyMostAttrib(dt, newdt);   // including class
-  DUPLICATE_ATTRIB(newdt, dt);
+  SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt)));
+  SET_OBJECT(newdt, OBJECT(dt));
+  IS_S4_OBJECT(dt) ? SET_S4_OBJECT(newdt) : UNSET_S4_OBJECT(newdt);  // To support S4 objects that incude data.table
+  //SHALLOW_DUPLICATE_ATTRIB(newdt, dt);  // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0
+
   // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It
   //        also increases truelength. Perhaps make that distinction, then, and split out, but marked
   //        so that the next change knows to duplicate.
-  //        Does copyMostAttrib duplicate each attrib or does it point? It seems to point, hence DUPLICATE_ATTRIB
-  //        for now otherwise example(merge.data.table) fails (since attr(d4,"sorted") gets written by setnames).
+  //        keepattr() also merely points to the entire attrbutes list and thus doesn't allow replacing
+  //        some of its elements.
+
+  // We copy all attributes that refer to column names so that calling setnames on either
+  // the original or the shallow copy doesn't break anything.
+  SEXP index = PROTECT(getAttrib(dt, sym_index)); protecti++;
+  setAttrib(newdt, sym_index, shallow_duplicate(index));
+
+  SEXP sorted = PROTECT(getAttrib(dt, sym_sorted)); protecti++;
+  setAttrib(newdt, sym_sorted, duplicate(sorted));
+
   SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++;
   SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++;
   if (isNull(cols)) {