Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ unit = "s")

10. Starting from 4.0.0, data.table is using R's `rbind` and `cbind` methods, as described in v1.12.6 news entry. Support for R 3.x.x is resolved when processing `NAMESPACE` file, at install time, or at the time of building package binaries. As a result, users on R 3.x.x, if installing from binaries, must use binaries built by R 3.x.x, and users on R 4.x.x, if installing from binaries, must use binaries built by R 4.x.x. Users will see `package ‘data.table’ was built under R version...` warning when this happen. Thanks to @vinhdizzo for reporting in [#4528](https://github.com/Rdatatable/data.table/issues/4528).

11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR.


# data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019)

## NEW FEATURES
Expand Down
84 changes: 84 additions & 0 deletions inst/atime/tests.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
pkg.edit.fun = quote(function(old.Package, new.Package, sha, new.pkg.path) {
pkg_find_replace <- function(glob, FIND, REPLACE) {
atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
}
Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
new.Package_ <- paste0(Package_, "_", sha)
pkg_find_replace(
"DESCRIPTION",
paste0("Package:\\s+", old.Package),
paste("Package:", new.Package))
pkg_find_replace(
file.path("src", "Makevars.*in"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
sprintf('packageVersion\\("%s"\\)', old.Package),
sprintf('packageVersion\\("%s"\\)', new.Package))
pkg_find_replace(
file.path("src", "init.c"),
paste0("R_init_", Package_regex),
paste0("R_init_", gsub("[.]", "_", new.Package_)))
pkg_find_replace(
"NAMESPACE",
sprintf('useDynLib\\("?%s"?', Package_regex),
paste0('useDynLib(', new.Package_))
})

test.list <- list(
# Performance regression fixed in: https://github.com/Rdatatable/data.table/pull/4440
"Test regression fixed in #4440" = list(
pkg.edit.fun = pkg.edit.fun,
N = 10^seq(3,8),
setup = quote({
set.seed(1L)
dt <- data.table(a = sample(N, N))
setindex(dt, a)
}),
expr = quote(data.table:::shallow(dt)),
"Before"="9d3b9202fddb980345025a4f6ac451ed26a423be",
"Regression"="752012f577f8e268bb6d0084ca39a09fa7fbc1c4",
"Fixed"="9d3b9202fddb980345025a4f6ac451ed26a423be"),

# Test based on https://github.com/Rdatatable/data.table/issues/5424
# Performance regression introduced in: https://github.com/Rdatatable/data.table/pull/4491
"Test regression introduced from #4491" = list(
pkg.edit.fun = pkg.edit.fun,
N = 10^seq(3, 8),
expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)),
setup = quote({
n <- N/100
set.seed(1L)
dt <- data.table(
g = sample(seq_len(n), N, TRUE),
x = runif(N),
key = "g")
dt_mod <- copy(dt)
}),
"Before"="be2f72e6f5c90622fe72e1c315ca05769a9dc854",
"Regression"="e793f53466d99f86e70fc2611b708ae8c601a451",
"Fixed"="58409197426ced4714af842650b0cc3b9e2cb842")

# Test based on https://github.com/Rdatatable/data.table/issues/4200
# Performance regression fixed in: https://github.com/Rdatatable/data.table/pull/4558
#"Test regression fixed in #4558" = list(
#pkg.edit.fun = pkg.edit.fun,
#N = 10^seq(1, 20),
#expr = quote(data.table:::`[.data.table`(d, , (max(v1) - min(v2)), by = id3)),
#setup = quote({
# set.seed(108)
# d <- data.table(
# id3 = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
# v1 = sample(5L, N, TRUE),
# v2 = sample(5L, N, TRUE))
# }),
# "Before" = "15f0598b9828d3af2eb8ddc9b38e0356f42afe4f",
# "Regression" = "6f360be0b2a6cf425f6df751ca9a99ec5d35ed93",
# "Fixed" = "ba32f3cba38ec270587e395f6e6c26a80be36be6")
)
33 changes: 24 additions & 9 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -12871,30 +12871,32 @@ ids <- sample(letters[1:3], 10, replace=TRUE)
scores <- rnorm(10)
dt <- data.table(id=ids, score=scores)
dt.s4 <- new("Data.Table", data.table(id=ids, score=scores))
test(1914.1, isS4(dt.s4))
test(1914.2, inherits(dt.s4, 'data.table'))
test(1914.01, isS4(dt.s4))
test(1914.02, inherits(dt.s4, 'data.table'))
# Test possible regression. shallow() needs to preserve the S4 bit to support S4 classes that contain data.table
test(1914.03, isS4(shallow(dt.s4)))
## pull out data from S4 as.list, and compare to list from dt
dt.s4.list <- dt.s4@.Data
names(dt.s4.list) <- names(dt.s4)
test(1914.3, dt.s4.list, as.list(dt)) # Underlying data not identical
test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical
# simple S4 conversion-isms work
df = data.frame(a=sample(letters, 10), b=1:10)
dt = as.data.table(df)
test(1914.4, identical(as(df, 'data.table'), dt))
test(1914.5, identical(as(dt, 'data.frame'), df))
test(1914.05, identical(as(df, 'data.table'), dt))
test(1914.06, identical(as(dt, 'data.frame'), df))
# data.table can be used in an S4 slot
dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10))
dt.comp <- new("S4Composition", data=dt)
test(1914.6, dt.comp@data, dt)
test(1914.07, dt.comp@data, dt)
# S4 methods dispatch properly on data.table slots"
dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10))
dt.comp <- new("S4Composition", data=dt)
setGeneric("dtGet", function(x, what) standardGeneric("dtGet"))
setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data})
setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]})
test(1914.7, dtGet(dt.comp), dt) # actually
test(1914.8, identical(dtGet(dt.comp, 1), dt[[1]]))
test(1914.9, identical(dtGet(dt.comp, 'b'), dt$b))
test(1914.08, dtGet(dt.comp), dt) # actually
test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]]))
test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b))
removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev
removeClass("S4Composition")
# END port of old testthat tests
Expand Down Expand Up @@ -16999,3 +17001,16 @@ B = data.table(x=1:2)
X = A == B
A[, y := 3:4]
test(2148, colnames(X), c('x'))

# shallow() shouldn't take a deep copy of indices, #4311
dt <- data.table(a = c(3, 1))
setindex(dt, a)
dt2 <- shallow(dt)
test(2149.1, address(attr(attr(dt, 'index'), '__a')), address(attr(attr(dt2, 'index'), '__a')))
# Testing possible future regression. shallow() needs to copy the names of indices and keys.
setnames(dt2, 'a', 'A')
test(2149.2, indices(dt), 'a')
setkey(dt, a)
dt2 <- shallow(dt)
setnames(dt2, 'a', 'A')
test(2149.3, key(dt), 'a')
20 changes: 16 additions & 4 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,25 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
R_len_t i,l;
int protecti=0;
SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here?
//copyMostAttrib(dt, newdt); // including class
DUPLICATE_ATTRIB(newdt, dt);
SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt)));
SET_OBJECT(newdt, OBJECT(dt));
IS_S4_OBJECT(dt) ? SET_S4_OBJECT(newdt) : UNSET_S4_OBJECT(newdt); // To support S4 objects that incude data.table
//SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0

// TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It
// also increases truelength. Perhaps make that distinction, then, and split out, but marked
// so that the next change knows to duplicate.
// Does copyMostAttrib duplicate each attrib or does it point? It seems to point, hence DUPLICATE_ATTRIB
// for now otherwise example(merge.data.table) fails (since attr(d4,"sorted") gets written by setnames).
// keepattr() also merely points to the entire attrbutes list and thus doesn't allow replacing
// some of its elements.

// We copy all attributes that refer to column names so that calling setnames on either
// the original or the shallow copy doesn't break anything.
SEXP index = PROTECT(getAttrib(dt, sym_index)); protecti++;
setAttrib(newdt, sym_index, shallow_duplicate(index));

SEXP sorted = PROTECT(getAttrib(dt, sym_sorted)); protecti++;
setAttrib(newdt, sym_sorted, duplicate(sorted));

SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++;
SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++;
if (isNull(cols)) {
Expand Down