From 9d3b9202fddb980345025a4f6ac451ed26a423be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Fri, 26 Jun 2020 03:24:34 +0200 Subject: [PATCH 1/6] Remove deep copy of indices from shallow() (#4440) --- NEWS.md | 3 +++ inst/tests/tests.Rraw | 33 ++++++++++++++++++++++++--------- src/assign.c | 20 ++++++++++++++++---- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 05e595a768..2d31090120 100644 --- a/NEWS.md +++ b/NEWS.md @@ -169,6 +169,9 @@ unit = "s") 10. Starting from 4.0.0, data.table is using R's `rbind` and `cbind` methods, as described in v1.12.6 news entry. Support for R 3.x.x is resolved when processing `NAMESPACE` file, at install time, or at the time of building package binaries. As a result, users on R 3.x.x, if installing from binaries, must use binaries built by R 3.x.x, and users on R 4.x.x, if installing from binaries, must use binaries built by R 4.x.x. Users will see `package ‘data.table’ was built under R version...` warning when this happen. Thanks to @vinhdizzo for reporting in [#4528](https://github.com/Rdatatable/data.table/issues/4528). +11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. + + # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) ## NEW FEATURES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 11168db5c5..7b3902a2b5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12871,30 +12871,32 @@ ids <- sample(letters[1:3], 10, replace=TRUE) scores <- rnorm(10) dt <- data.table(id=ids, score=scores) dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) -test(1914.1, isS4(dt.s4)) -test(1914.2, inherits(dt.s4, 'data.table')) +test(1914.01, isS4(dt.s4)) +test(1914.02, inherits(dt.s4, 'data.table')) +# Test possible regression. shallow() needs to preserve the S4 bit to support S4 classes that contain data.table +test(1914.03, isS4(shallow(dt.s4))) ## pull out data from S4 as.list, and compare to list from dt dt.s4.list <- dt.s4@.Data names(dt.s4.list) <- names(dt.s4) -test(1914.3, dt.s4.list, as.list(dt)) # Underlying data not identical +test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical # simple S4 conversion-isms work df = data.frame(a=sample(letters, 10), b=1:10) dt = as.data.table(df) -test(1914.4, identical(as(df, 'data.table'), dt)) -test(1914.5, identical(as(dt, 'data.frame'), df)) +test(1914.05, identical(as(df, 'data.table'), dt)) +test(1914.06, identical(as(dt, 'data.frame'), df)) # data.table can be used in an S4 slot dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) -test(1914.6, dt.comp@data, dt) +test(1914.07, dt.comp@data, dt) # S4 methods dispatch properly on data.table slots" dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) -test(1914.7, dtGet(dt.comp), dt) # actually -test(1914.8, identical(dtGet(dt.comp, 1), dt[[1]])) -test(1914.9, identical(dtGet(dt.comp, 'b'), dt$b)) +test(1914.08, dtGet(dt.comp), dt) # actually +test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]])) +test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b)) removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev removeClass("S4Composition") # END port of old testthat tests @@ -16999,3 +17001,16 @@ B = data.table(x=1:2) X = A == B A[, y := 3:4] test(2148, colnames(X), c('x')) + +# shallow() shouldn't take a deep copy of indices, #4311 +dt <- data.table(a = c(3, 1)) +setindex(dt, a) +dt2 <- shallow(dt) +test(2149.1, address(attr(attr(dt, 'index'), '__a')), address(attr(attr(dt2, 'index'), '__a'))) +# Testing possible future regression. shallow() needs to copy the names of indices and keys. +setnames(dt2, 'a', 'A') +test(2149.2, indices(dt), 'a') +setkey(dt, a) +dt2 <- shallow(dt) +setnames(dt2, 'a', 'A') +test(2149.3, key(dt), 'a') diff --git a/src/assign.c b/src/assign.c index 1392079e72..88fc260655 100644 --- a/src/assign.c +++ b/src/assign.c @@ -152,13 +152,25 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) R_len_t i,l; int protecti=0; SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here? - //copyMostAttrib(dt, newdt); // including class - DUPLICATE_ATTRIB(newdt, dt); + SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt))); + SET_OBJECT(newdt, OBJECT(dt)); + IS_S4_OBJECT(dt) ? SET_S4_OBJECT(newdt) : UNSET_S4_OBJECT(newdt); // To support S4 objects that incude data.table + //SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0 + // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It // also increases truelength. Perhaps make that distinction, then, and split out, but marked // so that the next change knows to duplicate. - // Does copyMostAttrib duplicate each attrib or does it point? It seems to point, hence DUPLICATE_ATTRIB - // for now otherwise example(merge.data.table) fails (since attr(d4,"sorted") gets written by setnames). + // keepattr() also merely points to the entire attrbutes list and thus doesn't allow replacing + // some of its elements. + + // We copy all attributes that refer to column names so that calling setnames on either + // the original or the shallow copy doesn't break anything. + SEXP index = PROTECT(getAttrib(dt, sym_index)); protecti++; + setAttrib(newdt, sym_index, shallow_duplicate(index)); + + SEXP sorted = PROTECT(getAttrib(dt, sym_sorted)); protecti++; + setAttrib(newdt, sym_sorted, duplicate(sorted)); + SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++; if (isNull(cols)) { From 0c682882bf91a304da6df6c1290071a277773250 Mon Sep 17 00:00:00 2001 From: Ani Date: Mon, 18 Mar 2024 23:16:45 -0700 Subject: [PATCH 2/6] Added the test --- inst/atime/tests.R | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 inst/atime/tests.R diff --git a/inst/atime/tests.R b/inst/atime/tests.R new file mode 100644 index 0000000000..2c8074f9c1 --- /dev/null +++ b/inst/atime/tests.R @@ -0,0 +1,48 @@ +pkg.edit.fun = quote(function(old.Package, new.Package, sha, new.pkg.path) { + pkg_find_replace <- function(glob, FIND, REPLACE) { + atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) + } + Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE) + Package_ <- gsub(".", "_", old.Package, fixed = TRUE) + new.Package_ <- paste0(Package_, "_", sha) + pkg_find_replace( + "DESCRIPTION", + paste0("Package:\\s+", old.Package), + paste("Package:", new.Package)) + pkg_find_replace( + file.path("src", "Makevars.*in"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + sprintf('packageVersion\\("%s"\\)', old.Package), + sprintf('packageVersion\\("%s"\\)', new.Package)) + pkg_find_replace( + file.path("src", "init.c"), + paste0("R_init_", Package_regex), + paste0("R_init_", gsub("[.]", "_", new.Package_))) + pkg_find_replace( + "NAMESPACE", + sprintf('useDynLib\\("?%s"?', Package_regex), + paste0('useDynLib(', new.Package_)) + }) + +test.list <- list( + # Performance regression fixed in: https://github.com/Rdatatable/data.table/pull/4440 + "Test regression fixed in #4440" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3,8), + setup = quote({ + set.seed(1L) + dt <- data.table(a = sample(N, N)) + setindex(dt, a) + }), + expr = quote(data.table:::shallow(dt)), + "Before"="ad7b67c80a551b7a1e2ef8b73d6162ed7737c934", + "Regression"="752012f577f8e268bb6d0084ca39a09fa7fbc1c4", + "Fixed"="9d3b9202fddb980345025a4f6ac451ed26a423be") +) From a763727bb5184dac1a271613d8d2bdfb896b508b Mon Sep 17 00:00:00 2001 From: Ani Date: Mon, 18 Mar 2024 23:45:33 -0700 Subject: [PATCH 3/6] Added the other (unrelated) test cases --- inst/atime/tests.R | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 2c8074f9c1..29fffb5ff2 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -44,5 +44,41 @@ test.list <- list( expr = quote(data.table:::shallow(dt)), "Before"="ad7b67c80a551b7a1e2ef8b73d6162ed7737c934", "Regression"="752012f577f8e268bb6d0084ca39a09fa7fbc1c4", - "Fixed"="9d3b9202fddb980345025a4f6ac451ed26a423be") + "Fixed"="9d3b9202fddb980345025a4f6ac451ed26a423be"), + + # Test based on https://github.com/Rdatatable/data.table/issues/5424 + # Performance regression introduced in: https://github.com/Rdatatable/data.table/pull/4491 + "Test regression introduced from #4491" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3, 8), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), + setup = quote({ + n <- N/100 + set.seed(1L) + dt <- data.table( + g = sample(seq_len(n), N, TRUE), + x = runif(N), + key = "g") + dt_mod <- copy(dt) + }), + "Before"="be2f72e6f5c90622fe72e1c315ca05769a9dc854", + "Regression"="e793f53466d99f86e70fc2611b708ae8c601a451", + "Fixed"="58409197426ced4714af842650b0cc3b9e2cb842"), + + # Test based on https://github.com/Rdatatable/data.table/issues/4200 + # Performance regression fixed in: https://github.com/Rdatatable/data.table/pull/4558 + "Test regression fixed in #4558" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(1, 20), + expr = quote(data.table:::`[.data.table`(d, , (max(v1) - min(v2)), by = id3)), + setup = quote({ + set.seed(108) + d <- data.table( + id3 = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))), + v1 = sample(5L, N, TRUE), + v2 = sample(5L, N, TRUE)) + }), + "Before" = "15f0598b9828d3af2eb8ddc9b38e0356f42afe4f", + "Regression" = "6f360be0b2a6cf425f6df751ca9a99ec5d35ed93", + "Fixed" = "ba32f3cba38ec270587e395f6e6c26a80be36be6") ) From 6bf7ad3b225ace9f86944171ed07fb3286c2cd9f Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 5 Apr 2024 02:37:11 -0700 Subject: [PATCH 4/6] Test commit to trigger my updated workflow --- inst/atime/tests.R | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 29fffb5ff2..e774fb05c9 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -82,3 +82,4 @@ test.list <- list( "Regression" = "6f360be0b2a6cf425f6df751ca9a99ec5d35ed93", "Fixed" = "ba32f3cba38ec270587e395f6e6c26a80be36be6") ) +# Test commit to trigger my updated workflow for this PR. From 245ad43eceb977f35cd9144da3e01de1b4b64901 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 5 Apr 2024 10:05:27 -0700 Subject: [PATCH 5/6] Update 'Before' (older commits such as 1.12.6 and 1.12.4 are still not working, so a placeholder till then) --- inst/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index e774fb05c9..cb97838019 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -42,7 +42,7 @@ test.list <- list( setindex(dt, a) }), expr = quote(data.table:::shallow(dt)), - "Before"="ad7b67c80a551b7a1e2ef8b73d6162ed7737c934", + "Before"="9d3b9202fddb980345025a4f6ac451ed26a423be", "Regression"="752012f577f8e268bb6d0084ca39a09fa7fbc1c4", "Fixed"="9d3b9202fddb980345025a4f6ac451ed26a423be"), From c2a47bc738234d473e0038c6b9a8c77a520f4338 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 5 Apr 2024 13:21:46 -0700 Subject: [PATCH 6/6] Omit one test case for a 1:1 historical regression comparison between the remaining two cases --- inst/atime/tests.R | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index cb97838019..c555e967e0 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -63,23 +63,22 @@ test.list <- list( }), "Before"="be2f72e6f5c90622fe72e1c315ca05769a9dc854", "Regression"="e793f53466d99f86e70fc2611b708ae8c601a451", - "Fixed"="58409197426ced4714af842650b0cc3b9e2cb842"), + "Fixed"="58409197426ced4714af842650b0cc3b9e2cb842") # Test based on https://github.com/Rdatatable/data.table/issues/4200 # Performance regression fixed in: https://github.com/Rdatatable/data.table/pull/4558 - "Test regression fixed in #4558" = list( - pkg.edit.fun = pkg.edit.fun, - N = 10^seq(1, 20), - expr = quote(data.table:::`[.data.table`(d, , (max(v1) - min(v2)), by = id3)), - setup = quote({ - set.seed(108) - d <- data.table( - id3 = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))), - v1 = sample(5L, N, TRUE), - v2 = sample(5L, N, TRUE)) - }), - "Before" = "15f0598b9828d3af2eb8ddc9b38e0356f42afe4f", - "Regression" = "6f360be0b2a6cf425f6df751ca9a99ec5d35ed93", - "Fixed" = "ba32f3cba38ec270587e395f6e6c26a80be36be6") + #"Test regression fixed in #4558" = list( + #pkg.edit.fun = pkg.edit.fun, + #N = 10^seq(1, 20), + #expr = quote(data.table:::`[.data.table`(d, , (max(v1) - min(v2)), by = id3)), + #setup = quote({ + # set.seed(108) + # d <- data.table( + # id3 = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))), + # v1 = sample(5L, N, TRUE), + # v2 = sample(5L, N, TRUE)) + # }), + # "Before" = "15f0598b9828d3af2eb8ddc9b38e0356f42afe4f", + # "Regression" = "6f360be0b2a6cf425f6df751ca9a99ec5d35ed93", + # "Fixed" = "ba32f3cba38ec270587e395f6e6c26a80be36be6") ) -# Test commit to trigger my updated workflow for this PR.