From 316c42bdc7c25155e460896ae89ba2d6f34d24c9 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 19:46:17 -0500 Subject: [PATCH 01/42] Update data.table.R --- R/data.table.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 1fee0396a9..8f08415302 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1027,10 +1027,14 @@ replace_dot_alias = function(e) { lhs = jsub[[2L]] jsub = jsub[[3L]] if (is.name(lhs)) { - lhs = as.character(lhs) + if (deparse(lhs) %chin% c('.SD', '.SDcols')) lhs = sdvars else lhs = as.character(lhs) } else { - # e.g. (MyVar):= or get("MyVar"):= - lhs = eval(lhs, parent.frame(), parent.frame()) + if (deparse(lhs) == 'names(.SD)') { + lhs = sdvars + } else { + # e.g. (MyVar):= or get("MyVar"):= + lhs = eval(lhs, parent.frame(), parent.frame()) + } } } else { # `:=`(c2=1L,c3=2L,...) From e9ae7d3c5661821c8fc78fac063be9ce638f4930 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 19:49:55 -0500 Subject: [PATCH 02/42] Update tests.Rraw --- inst/tests/tests.Rraw | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7fc969b55d..81f9f5e9b6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16682,3 +16682,12 @@ test(2129, rbind(A,B)$c3, expression(as.character(Sys.time()), as.character(Sys. 
######################## # Add new tests here # ######################## + +## make names(.SD) work - issue #795 +DT <- data.table(a=1:6, b=1:6, c=rep(c(T,F), 3)) +mycols <- 1:2 +test(2131.1, DT[, names(.SD) :=lapply(.SD, `*`, 2), .SDcols=mycols], data.table(a = (1:6)*2, b = (1:6)*2, c = rep(c(T, F), 3))) +test(2131.2, DT[, names(.SD) := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*4, b = (1:6)*4, c = rep(c(T, F), 3))) +test(2131.3, DT[, .SD := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*8, b = (1:6)*8, c = rep(c(T, F), 3))) +test(2131.4, DT[, .SDcols := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*16, b = (1:6)*16, c = rep(c(T, F), 3))) +test(2131.5, DT[, .SD := lapply(.SD, '*', 2)], data.table(a = (1:6)*32, b = (1:6)*32, c = rep(c(T, F), 3) * 2)) From 17e80c41746a78d6e4940bb649f181bc3a330ced Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 21:12:59 -0500 Subject: [PATCH 03/42] Update data.table.R --- R/data.table.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 8f08415302..44f0cd08d1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1027,14 +1027,14 @@ replace_dot_alias = function(e) { lhs = jsub[[2L]] jsub = jsub[[3L]] if (is.name(lhs)) { - if (deparse(lhs) %chin% c('.SD', '.SDcols')) lhs = sdvars else lhs = as.character(lhs) + if (deparse(lhs) == '.SD') lhs = sdvars else lhs = as.character(lhs) } else { - if (deparse(lhs) == 'names(.SD)') { - lhs = sdvars - } else { + # if (deparse(lhs) == 'names(.SD)') { + # lhs = sdvars + # } else { # e.g. (MyVar):= or get("MyVar"):= lhs = eval(lhs, parent.frame(), parent.frame()) - } + # } } } else { # `:=`(c2=1L,c3=2L,...) 
From d1c7a9933a3b4b4468cb964cad3130bb99079187 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 21:14:35 -0500 Subject: [PATCH 04/42] Update tests.Rraw --- inst/tests/tests.Rraw | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 81f9f5e9b6..7af565518d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16683,11 +16683,9 @@ test(2129, rbind(A,B)$c3, expression(as.character(Sys.time()), as.character(Sys. # Add new tests here # ######################## -## make names(.SD) work - issue #795 +## names(.SD) - issue #795 DT <- data.table(a=1:6, b=1:6, c=rep(c(T,F), 3)) mycols <- 1:2 -test(2131.1, DT[, names(.SD) :=lapply(.SD, `*`, 2), .SDcols=mycols], data.table(a = (1:6)*2, b = (1:6)*2, c = rep(c(T, F), 3))) -test(2131.2, DT[, names(.SD) := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*4, b = (1:6)*4, c = rep(c(T, F), 3))) -test(2131.3, DT[, .SD := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*8, b = (1:6)*8, c = rep(c(T, F), 3))) -test(2131.4, DT[, .SDcols := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*16, b = (1:6)*16, c = rep(c(T, F), 3))) -test(2131.5, DT[, .SD := lapply(.SD, '*', 2)], data.table(a = (1:6)*32, b = (1:6)*32, c = rep(c(T, F), 3) * 2)) +test(2131.1, DT[, .SD := lapply(.SD, `*`, 2), .SDcols = mycols], data.table(a = (1:6)*2, b = (1:6)*2, c = rep(c(T, F), 3))) +test(2131.3, DT[, .SD := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*4, b = (1:6)*4, c = rep(c(T, F), 3))) +test(2131.5, DT[, .SD := lapply(.SD, '*', 2)], data.table(a = (1:6)*8, b = (1:6)*8, c = rep(c(T, F), 3) * 2)) From 9d26aa7574b76faccedc3c800f887426415567b0 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 21:16:01 -0500 Subject: [PATCH 05/42] Update datatable-reference-semantics.Rmd --- 
vignettes/datatable-reference-semantics.Rmd | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index a89538fba2..a8d1900f95 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -258,7 +258,7 @@ head(flights) * The `LHS := RHS` form allows us to operate on multiple columns. In the RHS, to compute the `max` on columns specified in `.SDcols`, we make use of the base function `lapply()` along with `.SD` in the same way as we have seen before in the *"Introduction to data.table"* vignette. It returns a list of two elements, containing the maximum value corresponding to `dep_delay` and `arr_delay` for each group. # -Before moving on to the next section, let's clean up the newly created columns `speed`, `max_speed`, `max_dep_delay` and `max_arr_delay`. +Let's clean up the newly created columns `speed`, `max_speed`, `max_dep_delay` and `max_arr_delay`. ```{r} # RHS gets automatically recycled to length of LHS @@ -266,6 +266,22 @@ flights[, c("speed", "max_speed", "max_dep_delay", "max_arr_delay") := NULL] head(flights) ``` +#### -- How can we update multiple existing columns in place using `.SD`? + +```{r} +char_cols <- sapply(flights, is.character) +flights[, .SD := lapply(.SD, as.factor), .SDcols = char_cols] +str(flights[, ..char_cols]) +``` +#### {.bs-callout .bs-callout-info} + +* We also could have used `(char_cols)` on the `LHS` but `.SD` is a shorthand. + +Let's clean up again and make our newly made factor columns back to character columns. +```{r} +flights[, .SD := lapply(.SD, as.character), .SDcols = char_cols] +str(flights[, ..char_cols]) +``` ## 3) `:=` and `copy()` `:=` modifies the input object by reference. Apart from the features we have discussed already, sometimes we might want to use the update by reference feature for its side effect. 
And at other times it may not be desirable to modify the original object, in which case we can use `copy()` function, as we will see in a moment. From 54f35f6894be313c4fe56ff1d02aabe812911f56 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 21:18:19 -0500 Subject: [PATCH 06/42] Update assign.Rd --- man/assign.Rd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/man/assign.Rd b/man/assign.Rd index 4f2609c726..0267d5bed0 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -18,6 +18,9 @@ # LHS2 = RHS2, # ...), by = ...] +# 3. Multiple columns in place +# DT[i, .SD = lapply(.SD, fx), by = ..., .SDcols = ...] + set(x, i = NULL, j, value) } \arguments{ From 3c68d6e812e7fcb6e7ebcd83ca3df12d1387e45e Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 21:23:28 -0500 Subject: [PATCH 07/42] Update NEWS.md --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index d1b6b90bcd..7bc521d81c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -71,6 +71,8 @@ unit = "s") 9. `rbindlist` now supports columns of type `expression`, [#546](https://github.com/Rdatatable/data.table/issues/546). Thanks @jangorecki for the report. +10. Using `dt[, .SD := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report and @ColeMiller1 for PR. + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). 
From a009df040df5d28bbe3423270c39a8d4361b94b6 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 7 Jan 2020 21:35:15 -0500 Subject: [PATCH 08/42] Update NEWS.md --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7bc521d81c..c211a53684 100644 --- a/NEWS.md +++ b/NEWS.md @@ -71,7 +71,9 @@ unit = "s") 9. `rbindlist` now supports columns of type `expression`, [#546](https://github.com/Rdatatable/data.table/issues/546). Thanks @jangorecki for the report. -10. Using `dt[, .SD := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report and @ColeMiller1 for PR. +10. The dimensions of objects in a list column are now displayed, #3671. Thanks to @randomgambit for the request, and Tyson Barrett for the PR. + +11. Using `dt[, .SD := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report and @ColeMiller1 for PR. ## BUG FIXES From 18ccd2f5ac58d1540f8733c419978341fbdffbaa Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 8 Jan 2020 21:45:57 -0500 Subject: [PATCH 09/42] Update data.table.R --- R/data.table.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 38c975dc6a..29072e92d6 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1027,14 +1027,14 @@ replace_dot_alias = function(e) { lhs = jsub[[2L]] jsub = jsub[[3L]] if (is.name(lhs)) { - if (deparse(lhs) == '.SD') lhs = sdvars else lhs = as.character(lhs) + if (lhs == as.name('.SD')) lhs = sdvars else lhs = as.character(lhs) } else { - # if (deparse(lhs) == 'names(.SD)') { - # lhs = sdvars - # } else { - # e.g. 
(MyVar):= or get("MyVar"):= - lhs = eval(lhs, parent.frame(), parent.frame()) - # } + #i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) + if (lhs[[1]] == as.name('names') && lhs[[2]] == as.name('.SD')) lhs = sdvars + for (i in seq_along(lhs)[-1]){ + if (lhs[[i]] == as.name('names(.SD)')) lhs[[i]] = sdvars + } + lhs = eval(lhs, parent.frame(), parent.frame()) } } else { # `:=`(c2=1L,c3=2L,...) From 15e95f83794c8d4c24c594ae22893fc9c6a8879b Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 8 Jan 2020 21:47:42 -0500 Subject: [PATCH 10/42] Update tests.Rraw --- inst/tests/tests.Rraw | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bf86178e34..9920d9ced6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16741,9 +16741,32 @@ test(2131, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.lock # Add new tests here # ######################## -## names(.SD) - issue #795 -DT <- data.table(a=1:6, b=1:6, c=rep(c(T,F), 3)) -mycols <- 1:2 -test(2131.1, DT[, .SD := lapply(.SD, `*`, 2), .SDcols = mycols], data.table(a = (1:6)*2, b = (1:6)*2, c = rep(c(T, F), 3))) -test(2131.3, DT[, .SD := lapply(.SD, '*', 2), .SDcols = -3L], data.table(a = (1:6)*4, b = (1:6)*4, c = rep(c(T, F), 3))) -test(2131.5, DT[, .SD := lapply(.SD, '*', 2)], data.table(a = (1:6)*8, b = (1:6)*8, c = rep(c(T, F), 3) * 2)) +## make names(.SD) work - issue #795 could be merged with 299 tests above + +dt = data.table(A = 1:6, B = 6:11, C = 11:16, grp = c('c', 'b', 'b', 'a', 'a', 'a')) +cols = 1:2 + +test(2131.01, dt[, .SD := lapply(.SD, '*', 2), .SDcols = cols], data.table(A = (1:6) * 2, B = (6:11) * 2, C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2131.02, dt[4:6, .SD := lapply(.SD, '/', 2), .SDcols = cols], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 
'a'))) +test(2131.03, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2 * 2, 9:11 * 2), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2131.04, dt[, names(.SD) := lapply(.SD, '/', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2131.1,dt[, .SD := lapply(.SD, sum), by = grp], data.table(A = c(2, 10, 10, 15, 15, 15), B = c(12, 30, 30, 30, 30, 30), C = c(11L, 25L, 25L, 45L, 45L, 45L), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) + +dt[, grp := NULL] +test(2131.2, dt[, .SD := lapply(.SD, sum)], data.table(A = rep(sum(2, 10, 10, 15, 15, 15), 6L), B = rep(sum(12, 30, 30, 30, 30, 30), 6L), C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) +test(2131.3, dt[, .SD := NULL, .SDcols = cols], data.table(C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) + +dt = data.table(iris) +keep = c('Species', 'Sepal.Width') + +test(2131.4, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) + +dt = data.table(iris) +test(2131.5, dt[, setdiff(names(.SD), keep) := NULL], data.table(iris[, c('Sepal.Width', 'Species')])) + +dt = data.table(iris) +test(2131.6, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) + +dt = data.table(iris) +cols = c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width') +test(2131.7, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = is.numeric],data.table(iris)[, paste(cols, 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = cols]) From 112f81dc810e3603f7de5ba44798d8db79975f13 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 8 Jan 2020 22:10:45 -0500 Subject: [PATCH 11/42] Update tests.Rraw --- inst/tests/tests.Rraw | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 
9920d9ced6..783924e83f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16746,27 +16746,27 @@ test(2131, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.lock dt = data.table(A = 1:6, B = 6:11, C = 11:16, grp = c('c', 'b', 'b', 'a', 'a', 'a')) cols = 1:2 -test(2131.01, dt[, .SD := lapply(.SD, '*', 2), .SDcols = cols], data.table(A = (1:6) * 2, B = (6:11) * 2, C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2131.02, dt[4:6, .SD := lapply(.SD, '/', 2), .SDcols = cols], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2131.03, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2 * 2, 9:11 * 2), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2131.04, dt[, names(.SD) := lapply(.SD, '/', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2131.1,dt[, .SD := lapply(.SD, sum), by = grp], data.table(A = c(2, 10, 10, 15, 15, 15), B = c(12, 30, 30, 30, 30, 30), C = c(11L, 25L, 25L, 45L, 45L, 45L), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2132.01, dt[, .SD := lapply(.SD, '*', 2), .SDcols = cols], data.table(A = (1:6) * 2, B = (6:11) * 2, C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2132.02, dt[4:6, .SD := lapply(.SD, '/', 2), .SDcols = cols], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2132.03, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2 * 2, 9:11 * 2), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2132.04, dt[, names(.SD) := lapply(.SD, '/', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) +test(2132.10,dt[, .SD := lapply(.SD, sum), by = grp], data.table(A = c(2, 10, 10, 15, 15, 15), B = c(12, 30, 30, 
30, 30, 30), C = c(11L, 25L, 25L, 45L, 45L, 45L), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) dt[, grp := NULL] -test(2131.2, dt[, .SD := lapply(.SD, sum)], data.table(A = rep(sum(2, 10, 10, 15, 15, 15), 6L), B = rep(sum(12, 30, 30, 30, 30, 30), 6L), C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) -test(2131.3, dt[, .SD := NULL, .SDcols = cols], data.table(C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) +test(2132.20, dt[, .SD := lapply(.SD, sum)], data.table(A = rep(sum(2, 10, 10, 15, 15, 15), 6L), B = rep(sum(12, 30, 30, 30, 30, 30), 6L), C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) +test(2132.30, dt[, .SD := NULL, .SDcols = cols], data.table(C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) dt = data.table(iris) keep = c('Species', 'Sepal.Width') -test(2131.4, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) +test(2132.40, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) dt = data.table(iris) -test(2131.5, dt[, setdiff(names(.SD), keep) := NULL], data.table(iris[, c('Sepal.Width', 'Species')])) +test(2132.50, dt[, setdiff(names(.SD), keep) := NULL], data.table(iris[, c('Sepal.Width', 'Species')])) dt = data.table(iris) -test(2131.6, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) +test(2132.60, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) dt = data.table(iris) cols = c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width') -test(2131.7, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = is.numeric],data.table(iris)[, paste(cols, 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = cols]) +test(2132.70, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = is.numeric],data.table(iris)[, paste(cols, 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = cols]) From 2c39630b7dbea2acf81501570778c922111cafb5 Mon Sep 17 00:00:00 
2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Thu, 9 Jan 2020 21:26:41 -0500 Subject: [PATCH 12/42] Update data.table.R --- R/data.table.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 29072e92d6..7a2cc3bd9b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1030,11 +1030,13 @@ replace_dot_alias = function(e) { if (lhs == as.name('.SD')) lhs = sdvars else lhs = as.character(lhs) } else { #i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) - if (lhs[[1]] == as.name('names') && lhs[[2]] == as.name('.SD')) lhs = sdvars - for (i in seq_along(lhs)[-1]){ - if (lhs[[i]] == as.name('names(.SD)')) lhs[[i]] = sdvars + replace_names_sd = function(e){ + if (length(e) == 1L) return(e) + if (e[[1L]] == as.name('names') && e[[2L]] == as.name('.SD')) return(sdvars) + for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_names_sd(e[[i]]) + e } - lhs = eval(lhs, parent.frame(), parent.frame()) + lhs = eval(replace_names_sd(lhs), parent.frame(), parent.frame()) } } else { # `:=`(c2=1L,c3=2L,...) 
From fcb270a02ce18d215a510f3bd0fe9d17097d6868 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Thu, 9 Jan 2020 21:28:32 -0500 Subject: [PATCH 13/42] Update tests.Rraw --- inst/tests/tests.Rraw | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 783924e83f..21b7246fde 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16741,32 +16741,25 @@ test(2131, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.lock # Add new tests here # ######################## -## make names(.SD) work - issue #795 could be merged with 299 tests above +## make names(.SD) work - issue #795 -dt = data.table(A = 1:6, B = 6:11, C = 11:16, grp = c('c', 'b', 'b', 'a', 'a', 'a')) -cols = 1:2 +dt = data.table(a = 1:4, b = 5:8) -test(2132.01, dt[, .SD := lapply(.SD, '*', 2), .SDcols = cols], data.table(A = (1:6) * 2, B = (6:11) * 2, C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2132.02, dt[4:6, .SD := lapply(.SD, '/', 2), .SDcols = cols], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2132.03, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2 * 2, 9:11 * 2), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2132.04, dt[, names(.SD) := lapply(.SD, '/', 2), .SDcols = 2L], data.table(A = c(1:3 * 2, 4:6), B = c(6:8 * 2, 9:11), C = (11:16), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) -test(2132.10,dt[, .SD := lapply(.SD, sum), by = grp], data.table(A = c(2, 10, 10, 15, 15, 15), B = c(12, 30, 30, 30, 30, 30), C = c(11L, 25L, 25L, 45L, 45L, 45L), grp = c('c', 'b', 'b', 'a', 'a', 'a'))) - -dt[, grp := NULL] -test(2132.20, dt[, .SD := lapply(.SD, sum)], data.table(A = rep(sum(2, 10, 10, 15, 15, 15), 6L), B = rep(sum(12, 30, 30, 30, 30, 30), 6L), C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) 
-test(2132.30, dt[, .SD := NULL, .SDcols = cols], data.table(C = rep(sum(11L, 25L, 25L, 45L, 45L, 45L), 6L))) +test(2133.1, dt[, .SD := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) +test(2133.2, dt[, .SD := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) +test(2133.3, dt[, .SD := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) +test(2133.4, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2)))) +test(2133.5, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) +test(2133.6, dt[, c(names(.SD)) := NULL], null.data.table()) dt = data.table(iris) keep = c('Species', 'Sepal.Width') - -test(2132.40, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) +test(2133.7, dt[, names(.SD) := lapply(.SD, max), by = Species], data.table(iris)[, names(iris)[-5] := lapply(.SD, max), by = Species]) dt = data.table(iris) -test(2132.50, dt[, setdiff(names(.SD), keep) := NULL], data.table(iris[, c('Sepal.Width', 'Species')])) - -dt = data.table(iris) -test(2132.60, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[, c('Sepal.Width', 'Species')])) +keep = c('Species', 'Sepal.Width') +test(2133.8, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[names(iris) %in% keep])) dt = data.table(iris) cols = c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width') -test(2132.70, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = is.numeric],data.table(iris)[, paste(cols, 'max', sep = '_') := lapply(.SD, max), by = Species, .SDcols = cols]) +test(2133.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = Species] , data.table(iris)[, paste(cols, 'max', sep = '_') := lapply(.SD, max), by = Species]) From 21d3a933570829c76ca5680c2e1e42e1221553a5 Mon Sep 17 00:00:00 2001 From: Cole Miller 
<57992489+ColeMiller1@users.noreply.github.com> Date: Thu, 9 Jan 2020 22:18:15 -0500 Subject: [PATCH 14/42] replace iris with raw dataset --- inst/tests/tests.Rraw | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 21b7246fde..22eb7158bd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16752,14 +16752,12 @@ test(2133.4, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integ test(2133.5, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) test(2133.6, dt[, c(names(.SD)) := NULL], null.data.table()) -dt = data.table(iris) -keep = c('Species', 'Sepal.Width') -test(2133.7, dt[, names(.SD) := lapply(.SD, max), by = Species], data.table(iris)[, names(iris)[-5] := lapply(.SD, max), by = Species]) +dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) +test(2133.7, dt[, names(.SD) := lapply(.SD, max), by = grp], data.table(a = c(2L, 2L, 3L, 4L), b = c(6L, 6L, 7L, 8L), grp = c('a', 'a', 'b', 'c'))) -dt = data.table(iris) -keep = c('Species', 'Sepal.Width') -test(2133.8, dt[, .SD := NULL, .SDcols = !keep], data.table(iris[names(iris) %in% keep])) +dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) +keep = c('a', 'b') +test(2133.8, dt[, .SD := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) -dt = data.table(iris) -cols = c('Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width') -test(2133.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = Species] , data.table(iris)[, paste(cols, 'max', sep = '_') := lapply(.SD, max), by = Species]) +dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) +test(2133.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))[, paste(c('a','b'), 'max', sep = '_') := lapply(.SD, max), by = grp]) From 10b36db30ab206d7c2acd6f636ddb10f4db241d6 Mon Sep 17 00:00:00 2001 
From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 14 Jan 2020 18:18:45 -0500 Subject: [PATCH 15/42] Update tests.Rraw --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 22eb7158bd..89a3c949e4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16761,3 +16761,6 @@ test(2133.8, dt[, .SD := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) test(2133.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))[, paste(c('a','b'), 'max', sep = '_') := lapply(.SD, max), by = grp]) + +dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) +test(2133.91, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) From 799341922c99603056d48577a0215229d0dd8721 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 19 Jan 2020 08:01:20 -0500 Subject: [PATCH 16/42] update replace_names_sd and made .SD := not work --- R/data.table.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 7a2cc3bd9b..468d2a5eeb 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1027,16 +1027,16 @@ replace_dot_alias = function(e) { lhs = jsub[[2L]] jsub = jsub[[3L]] if (is.name(lhs)) { - if (lhs == as.name('.SD')) lhs = sdvars else lhs = as.character(lhs) + lhs = as.character(lhs) } else { #i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) - replace_names_sd = function(e){ + replace_names_sd = function(e, cols){ if (length(e) == 1L) return(e) - if (e[[1L]] == as.name('names') && e[[2L]] == as.name('.SD')) return(sdvars) - for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = 
replace_names_sd(e[[i]]) + if (e[[1L]] == quote(names) && e[[2L]] == quote(.SD)) return(cols) + for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_names_sd(e[[i]], cols) e } - lhs = eval(replace_names_sd(lhs), parent.frame(), parent.frame()) + lhs = eval(replace_names_sd(lhs, sdvars), parent.frame(), parent.frame()) } } else { # `:=`(c2=1L,c3=2L,...) From 269967ef8d596a052038a7b5f11a615cc00b6118 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 19 Jan 2020 08:02:18 -0500 Subject: [PATCH 17/42] change .SD to names(.SD) --- inst/tests/tests.Rraw | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 89a3c949e4..41cb71bc52 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16745,9 +16745,9 @@ test(2131, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.lock dt = data.table(a = 1:4, b = 5:8) -test(2133.1, dt[, .SD := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) -test(2133.2, dt[, .SD := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) -test(2133.3, dt[, .SD := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) +test(2133.1, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) +test(2133.2, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) +test(2133.3, dt[, names(.SD) := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) test(2133.4, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2)))) test(2133.5, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) test(2133.6, dt[, c(names(.SD)) := NULL], null.data.table()) @@ -16757,7 +16757,7 @@ test(2133.7, dt[, names(.SD) := lapply(.SD, max), by = grp], data.table(a = 
c(2L dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) keep = c('a', 'b') -test(2133.8, dt[, .SD := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) +test(2133.8, dt[, names(.SD) := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) test(2133.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))[, paste(c('a','b'), 'max', sep = '_') := lapply(.SD, max), by = grp]) From 76b5e64f34fb9ee0aac68f80cdc7cb6dfeae3216 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 19 Jan 2020 08:04:18 -0500 Subject: [PATCH 18/42] update typo; change .SD to names(.SD) --- man/assign.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/assign.Rd b/man/assign.Rd index 0267d5bed0..645c5d5738 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -19,7 +19,7 @@ # ...), by = ...] # 3. Multiple columns in place -# DT[i, .SD = lapply(.SD, fx), by = ..., .SDcols = ...] +# DT[i, names(.SD) := lapply(.SD, fx), by = ...] 
set(x, i = NULL, j, value) } From ed879f6528519f598ace3b899ee6e699cf000f63 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 19 Jan 2020 08:10:17 -0500 Subject: [PATCH 19/42] update to names(.SD) --- vignettes/datatable-reference-semantics.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index a8d1900f95..c19c237c78 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -270,16 +270,16 @@ head(flights) ```{r} char_cols <- sapply(flights, is.character) -flights[, .SD := lapply(.SD, as.factor), .SDcols = char_cols] +flights[, names(.SD) := lapply(.SD, as.factor), .SDcols = char_cols] str(flights[, ..char_cols]) ``` #### {.bs-callout .bs-callout-info} -* We also could have used `(char_cols)` on the `LHS` but `.SD` is a shorthand. +* We also could have used `(char_cols)` on the `LHS`. Let's clean up again and make our newly made factor columns back to character columns. ```{r} -flights[, .SD := lapply(.SD, as.character), .SDcols = char_cols] +flights[, names(.SD) := lapply(.SD, as.character), .SDcols = char_cols] str(flights[, ..char_cols]) ``` ## 3) `:=` and `copy()` From 1fbd6317a31147c9ccf12e51388cf46325dfab85 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Mon, 20 Jan 2020 20:40:28 -0500 Subject: [PATCH 20/42] include names(.SD) and fx to .SD usage I may have gone too far. There's no use of ```(cols) := ...``` now but there is at least a reference to the other vignette. 
--- vignettes/datatable-sd-usage.Rmd | 50 ++++++++++++++------------------ 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 8f23c58554..391dcb105f 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -73,7 +73,13 @@ The first way to impact what `.SD` is is to limit the _columns_ contained in `.S Pitching[ , .SD, .SDcols = c('W', 'L', 'G')] ``` -This is just for illustration and was pretty boring. But even this simply usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations: +This is just for illustration and was pretty boring. In addition to accepting a character vector, `.SDcols` also accepts: + +1. a function such as `is.character` +2. name searching with `patterns()` +3. Integer and boolean vectors. + +This simple usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations: ## Column Type Conversion @@ -87,52 +93,40 @@ We notice that the following columns are stored as `character` in the `Teams` da # teamIDretro: Team ID used by Retrosheet fkt = c('teamIDBR', 'teamIDlahman45', 'teamIDretro') # confirm that they're stored as `character` -Teams[ , sapply(.SD, is.character), .SDcols = fkt] -``` - -If you're confused by the use of `sapply` here, note that it's quite similar for base R `data.frames`: - -```{r identify_factors_as_df} -setDF(Teams) # convert to data.frame for illustration -sapply(Teams[ , fkt], is.character) -setDT(Teams) # convert back to data.table +str(Teams[ , ..fkt]) ``` -The key to understanding this syntax is to recall that a `data.table` (as well as a `data.frame`) can be considered as a `list` where each element is a column -- thus, `sapply`/`lapply` applies the `FUN` argument (in this case, `is.character`) to each _column_ and returns the result as `sapply`/`lapply` usually would. 
- -The syntax to now convert these columns to `factor` is very similar -- simply add the `:=` assignment operator: +The syntax to now convert these columns to `factor` is simple: ```{r assign_factors} -Teams[ , (fkt) := lapply(.SD, factor), .SDcols = fkt] +Teams[ , names(.SD) := lapply(.SD, factor), .SDcols = patterns('teamID')] # print out the first column to demonstrate success head(unique(Teams[[fkt[1L]]])) ``` -Note that we must wrap `fkt` in parentheses `()` to force `data.table` to interpret this as column names, instead of trying to assign a column named `'fkt'`. +Note: -Actually, the `.SDcols` argument is quite flexible; above, we supplied a `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. `.SDcols` even accepts regular expression-based pattern matching. +1. The `:=` is an assignment operator to update the `data.table` in place without making a copy. See [reference semantics](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reference-semantics.html) for more. +2. `names(.SD)` indicates which columns we are updating - in this case we update all entire `.SD`. +3. `lapply()` loops through each column of the `.SD` and converts the column to a factor. +4. We use the `.SDcols` to only select columns that have pattern of `teamID`. + +Again, the `.SDcols` argument is quite flexible; above, we supplied `patterns` but we could have also supplied `fkt` or any `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. Finally, the use of a function is very helpful. 
For example, we could do the following to convert all `factor` columns to `character`: ```{r sd_as_logical} -# while .SDcols accepts a logical vector, -# := does not, so we need to convert to column -# positions with which() -fkt_idx = which(sapply(Teams, is.factor)) -Teams[ , (fkt_idx) := lapply(.SD, as.character), .SDcols = fkt_idx] -head(unique(Teams[[fkt_idx[1L]]])) +fct_idx = Teams[, which(sapply(.SD, is.factor))] #column numbers to show the class changing +str(Teams[[fct_idx[1L]]]) +Teams[ , names(.SD) := lapply(.SD, as.character), .SDcols = is.factor] +str(Teams[[fct_idx[1L]]]) ``` Lastly, we can do pattern-based matching of columns in `.SDcols` to select all columns which contain `team` back to `factor`: ```{r sd_patterns} Teams[ , .SD, .SDcols = patterns('team')] - -# now convert these columns to factor; -# value = TRUE in grep() is for the LHS of := to -# get column names instead of positions -team_idx = grep('team', names(Teams), value = TRUE) -Teams[ , (team_idx) := lapply(.SD, factor), .SDcols = team_idx] +Teams[ , names(.SD) := lapply(.SD, factor), .SDcols = patterns('team')] ``` ** A proviso to the above: _explicitly_ using column numbers (like `DT[ , (1) := rnorm(.N)]`) is bad practice and can lead to silently corrupted code over time if column positions change. Even implicitly using numbers can be dangerous if we don't keep smart/strict control over the ordering of when we create the numbered index and when we use it. From 8df7af526d56d9941310c6910d169a0ec55198d9 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Tue, 21 Jan 2020 06:31:24 -0500 Subject: [PATCH 21/42] Updates news to names(.SD) --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 2e456cf12c..faebdd2470 100644 --- a/NEWS.md +++ b/NEWS.md @@ -73,7 +73,7 @@ unit = "s") 10. 
The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR. -11. Using `dt[, .SD := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report and @ColeMiller1 for PR. +11. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report and @ColeMiller1 for PR. ## BUG FIXES From 8c2d2730b5af9195c41b4e505208298a78250623 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 29 Jan 2020 22:36:38 -0500 Subject: [PATCH 22/42] Update typo. --- vignettes/datatable-sd-usage.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 391dcb105f..25c51c401d 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -107,7 +107,7 @@ head(unique(Teams[[fkt[1L]]])) Note: 1. The `:=` is an assignment operator to update the `data.table` in place without making a copy. See [reference semantics](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reference-semantics.html) for more. -2. `names(.SD)` indicates which columns we are updating - in this case we update all entire `.SD`. +2. `names(.SD)` indicates which columns we are updating - in this case we update the entire `.SD`. 3. `lapply()` loops through each column of the `.SD` and converts the column to a factor. 4. We use the `.SDcols` to only select columns that have pattern of `teamID`. 
From 7267766a34381592ec418fcd0aea77b6fb52c2a4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 2 Feb 2020 16:30:46 +0800 Subject: [PATCH 23/42] tweak NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index faebdd2470..bed817f8b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -73,7 +73,7 @@ unit = "s") 10. The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR. -11. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report and @ColeMiller1 for PR. +11. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report and @ColeMiller1 for PR. ## BUG FIXES From 197cb5452d6416d870a1575ffaa0133138060d60 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 2 Feb 2020 16:40:00 +0800 Subject: [PATCH 24/42] minor grammar --- vignettes/datatable-sd-usage.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 25c51c401d..3b31de095d 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -111,7 +111,7 @@ Note: 3. `lapply()` loops through each column of the `.SD` and converts the column to a factor. 4. We use the `.SDcols` to only select columns that have pattern of `teamID`. -Again, the `.SDcols` argument is quite flexible; above, we supplied `patterns` but we could have also supplied `fkt` or any `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. 
Finally, the use of a function is very helpful. +Again, the `.SDcols` argument is quite flexible; above, we supplied `patterns` but we could have also supplied `fkt` or any `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. Finally, the use of a function to filter columns is very helpful. For example, we could do the following to convert all `factor` columns to `character`: From 8d7f2327399f06b15bf3030750c756dc996640ef Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 2 Feb 2020 16:40:36 +0800 Subject: [PATCH 25/42] jans comment --- vignettes/datatable-sd-usage.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 3b31de095d..a08679e21e 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -116,7 +116,7 @@ Again, the `.SDcols` argument is quite flexible; above, we supplied `patterns` b For example, we could do the following to convert all `factor` columns to `character`: ```{r sd_as_logical} -fct_idx = Teams[, which(sapply(.SD, is.factor))] #column numbers to show the class changing +fct_idx = Teams[, which(sapply(.SD, is.factor))] # column numbers to show the class changing str(Teams[[fct_idx[1L]]]) Teams[ , names(.SD) := lapply(.SD, as.character), .SDcols = is.factor] str(Teams[[fct_idx[1L]]]) From 29cc659526b7eba495c783037b048ac34acd90b2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 2 Feb 2020 16:42:59 +0800 Subject: [PATCH 26/42] jan's comment (ii) --- vignettes/datatable-sd-usage.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index a08679e21e..eca232dfd7 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -75,9 +75,9 @@ Pitching[ , .SD, .SDcols = 
c('W', 'L', 'G')] This is just for illustration and was pretty boring. In addition to accepting a character vector, `.SDcols` also accepts: -1. a function such as `is.character` -2. name searching with `patterns()` -3. Integer and boolean vectors. +1. any function such as `is.character` to filter _columns_ +2. the function^{*} `patterns()` to filter by _column names_ +3. integer and logical vectors This simple usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations: From f7adef8527f96e73a9eccef17bd16709294b2a60 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 2 Feb 2020 16:47:54 +0800 Subject: [PATCH 27/42] added "footnote" --- vignettes/datatable-sd-usage.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index eca232dfd7..9c4140f77e 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -79,6 +79,8 @@ This is just for illustration and was pretty boring. In addition to accepting a 2. the function^{*} `patterns()` to filter by _column names_ 3. 
integer and logical vectors +*see `?patterns` for more details + This simple usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations: ## Column Type Conversion From 9469e4e1897fd14b198a8aa513049e532dfda303 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 2 Feb 2020 07:22:06 -0500 Subject: [PATCH 28/42] Add is.name(e[[2L]]) --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 468d2a5eeb..143604ce4d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1032,7 +1032,7 @@ replace_dot_alias = function(e) { #i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) replace_names_sd = function(e, cols){ if (length(e) == 1L) return(e) - if (e[[1L]] == quote(names) && e[[2L]] == quote(.SD)) return(cols) + if (e[[1L]] == quote(names) && is.name(e[[2L]]) && e[[2L]] == quote(.SD)) return(cols) for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_names_sd(e[[i]], cols) e } From 3ba55186196cace7c7a51e876f48b39b6a18cc37 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 2 Feb 2020 07:38:18 -0500 Subject: [PATCH 29/42] Put tests above Add new tests here --- inst/tests/tests.Rraw | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 41cb71bc52..312f41fae5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16736,13 +16736,7 @@ x = data.table(a=1:3, b=4:6) test(2131, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.locked'), list(NULL, NULL, NULL)) - -######################## -# Add new tests here # -######################## - ## make names(.SD) work - issue #795 - dt = data.table(a = 1:4, b = 5:8) test(2133.1, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) @@ -16764,3 +16758,8 @@ test(2133.9, dt[, 
paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) test(2133.91, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) + +######################## +# Add new tests here # +######################## + From 8e1c109ecc9a433c8c7d30b4f335786750a38a44 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 2 Feb 2020 07:39:14 -0500 Subject: [PATCH 30/42] added test to test names(.SD(2)) --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 312f41fae5..9e177f896d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16758,8 +16758,8 @@ test(2133.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) test(2133.91, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) +test(2133.92, dt[, names(.SD(2)) := lapply(.SD, .I)], error = "could not find function \".SD\"") ######################## # Add new tests here # ######################## - From c389b3cd6ed64e2c3b6df358ce464d2d7ef4fc41 Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 2 Feb 2020 07:47:09 -0500 Subject: [PATCH 31/42] include .SDcols in example for assign --- man/assign.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/assign.Rd b/man/assign.Rd index 645c5d5738..4b6252cda3 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -19,7 +19,7 @@ # ...), by = ...] # 3. Multiple columns in place -# DT[i, names(.SD) := lapply(.SD, fx), by = ...] 
+# DT[i, names(.SD) := lapply(.SD, fx), by = ..., .SDcols = ...] set(x, i = NULL, j, value) } From 2c3fb513e577345ccb56e9ec421fea2b6c3c372c Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Sun, 2 Feb 2020 08:15:50 -0500 Subject: [PATCH 32/42] included .SDcols = function example --- vignettes/datatable-reference-semantics.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index c19c237c78..89635ed79b 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -277,10 +277,10 @@ str(flights[, ..char_cols]) * We also could have used `(char_cols)` on the `LHS`. -Let's clean up again and make our newly made factor columns back to character columns. +Let's clean up again and make our newly made factor columns back to character columns. This time we will make use of `.SDcols` accepting a function to return columns. In this case, `is.factor` will return the columns which are factors. For more on the **S**ubset of the **D**ata, there is also an [SD Usage vignette](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-sd-usage.html). 
```{r} -flights[, names(.SD) := lapply(.SD, as.character), .SDcols = char_cols] -str(flights[, ..char_cols]) +flights[, names(.SD) := lapply(.SD, as.character), .SDcols = is.factor] +str(flights[, .SD, .SDcols = is.character]) ``` ## 3) `:=` and `copy()` From f5ab271b877ed79116035b999ba7d70e5ed01c6b Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 26 Feb 2020 20:40:05 -0500 Subject: [PATCH 33/42] test 2138 is greater than 2137 --- inst/tests/tests.Rraw | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b64f2ed77b..a039023c60 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16853,23 +16853,23 @@ test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) # make names(.SD) work - issue #795 dt = data.table(a = 1:4, b = 5:8) -test(2137.1, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) -test(2137.2, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) -test(2137.3, dt[, names(.SD) := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) -test(2137.4, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2)))) -test(2137.5, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) -test(2137.6, dt[, c(names(.SD)) := NULL], null.data.table()) +test(2139.1, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) +test(2139.2, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) +test(2139.3, dt[, names(.SD) := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) +test(2139.4, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2)))) 
+test(2139.5, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) +test(2139.6, dt[, c(names(.SD)) := NULL], null.data.table()) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) -test(2137.7, dt[, names(.SD) := lapply(.SD, max), by = grp], data.table(a = c(2L, 2L, 3L, 4L), b = c(6L, 6L, 7L, 8L), grp = c('a', 'a', 'b', 'c'))) +test(2139.7, dt[, names(.SD) := lapply(.SD, max), by = grp], data.table(a = c(2L, 2L, 3L, 4L), b = c(6L, 6L, 7L, 8L), grp = c('a', 'a', 'b', 'c'))) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) keep = c('a', 'b') -test(2137.8, dt[, names(.SD) := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) +test(2139.8, dt[, names(.SD) := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) -test(2137.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))[, paste(c('a','b'), 'max', sep = '_') := lapply(.SD, max), by = grp]) +test(2139.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))[, paste(c('a','b'), 'max', sep = '_') := lapply(.SD, max), by = grp]) dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) -test(2137.91, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) -test(2137.92, dt[, names(.SD(2)) := lapply(.SD, .I)], error = "could not find function \".SD\"") +test(2139.91, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) +test(2139.92, dt[, names(.SD(2)) := lapply(.SD, .I)], error = "could not find function \".SD\"") From be720a31c76333b33538fd2510af81145e28e970 Mon Sep 17 
00:00:00 2001 From: Michael Chirico Date: Mon, 26 Feb 2024 22:01:41 -0800 Subject: [PATCH 34/42] bad merge --- vignettes/datatable-reference-semantics.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 7b10a655bd..ab409bdda8 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -250,7 +250,7 @@ head(flights) * The `LHS := RHS` form allows us to operate on multiple columns. In the RHS, to compute the `max` on columns specified in `.SDcols`, we make use of the base function `lapply()` along with `.SD` in the same way as we have seen before in the *"Introduction to data.table"* vignette. It returns a list of two elements, containing the maximum value corresponding to `dep_delay` and `arr_delay` for each group. # -Let's clean up the newly created columns `speed`, `max_speed`, `max_dep_delay` and `max_arr_delay`. +Before moving on to the next section, let's clean up the newly created columns `speed`, `max_speed`, `max_dep_delay` and `max_arr_delay`. ```{r} # RHS gets automatically recycled to length of LHS From 7b0f8f14893f225d3c8dcb934827ddb2bd1dbf3c Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 18 Mar 2024 22:33:51 -0400 Subject: [PATCH 35/42] Make updates per Michael's comments. 
--- R/data.table.R | 6 ++--- inst/tests/tests.Rraw | 25 +++++++++------------ vignettes/datatable-reference-semantics.Rmd | 19 ++++++++-------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index b9358d08c2..45ec4a1924 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1122,11 +1122,11 @@ replace_dot_alias = function(e) { if (is.name(lhs)) { lhs = as.character(lhs) } else { - #i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) + # i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) replace_names_sd = function(e, cols){ if (length(e) == 1L) return(e) - if (e[[1L]] == quote(names) && is.name(e[[2L]]) && e[[2L]] == quote(.SD)) return(cols) - for (i in seq_along(e)[-1L]) if (!is.null(e[[i]])) e[[i]] = replace_names_sd(e[[i]], cols) + if (e %iscall% "names" && is.name(e2 <- e[[2L]]) && e2 == ".SD") return(cols) + for (i in 2:length(e)) if (!is.null(e[[i]])) e[[i]] = replace_names_sd(e[[i]], cols) e } lhs = eval(replace_names_sd(lhs, sdvars), parent.frame(), parent.frame()) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 377de0bcc0..8580ca22cb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16704,9 +16704,6 @@ dt = data.table(x = rep(1:3, each = 3), y = runif(9)) out = dt[, list(evaluated = list(f(copy(.SD)))), by = x] test(2131.2, class(out$evaluated[[1L]]), 'environment') - - - # S4 object not suported in fifelse and fcase, #4135 class2132 = setClass("class2132", slots=list(x="numeric")) s1 = class2132(x=20191231) @@ -18301,23 +18298,23 @@ options(old) # make names(.SD) work - issue #795 dt = data.table(a = 1:4, b = 5:8) -test(2247.1, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) -test(2247.2, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) -test(2247.3, dt[, names(.SD) := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) -test(2247.4, 
dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2)))) -test(2247.5, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) -test(2247.6, dt[, c(names(.SD)) := NULL], null.data.table()) +test(2247.01, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) +test(2247.02, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) +test(2247.03, dt[, names(.SD) := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) +test(2247.04, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2)))) +test(2247.05, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) +test(2247.06, dt[, c(names(.SD)) := NULL], null.data.table()) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) -test(2247.7, dt[, names(.SD) := lapply(.SD, max), by = grp], data.table(a = c(2L, 2L, 3L, 4L), b = c(6L, 6L, 7L, 8L), grp = c('a', 'a', 'b', 'c'))) +test(2247.07, dt[, names(.SD) := lapply(.SD, max), by = grp], data.table(a = c(2L, 2L, 3L, 4L), b = c(6L, 6L, 7L, 8L), grp = c('a', 'a', 'b', 'c'))) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) keep = c('a', 'b') -test(2247.8, dt[, names(.SD) := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) +test(2247.08, dt[, names(.SD) := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) -test(2247.9, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))[, paste(c('a','b'), 'max', sep = '_') := lapply(.SD, max), by = grp]) +test(2247.09, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'), a_max = c(2L, 2L, 3L, 4L), b_max = c(6L, 6L, 
7L, 8L))) dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) -test(2247.91, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) -test(2247.92, dt[, names(.SD(2)) := lapply(.SD, .I)], error = "could not find function \".SD\"") +test(2247.10, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) +test(2247.11, dt[, names(.SD(2)) := lapply(.SD, .I)], error = 'could not find function ".SD"') diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index ab409bdda8..2de404788f 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -261,19 +261,20 @@ head(flights) #### -- How can we update multiple existing columns in place using `.SD`? ```{r} -char_cols <- sapply(flights, is.character) -flights[, names(.SD) := lapply(.SD, as.factor), .SDcols = char_cols] -str(flights[, ..char_cols]) +flights[, names(.SD) := lapply(.SD, as.factor), .SDcols = is.character] ``` -#### {.bs-callout .bs-callout-info} - -* We also could have used `(char_cols)` on the `LHS`. - Let's clean up again and make our newly made factor columns back to character columns. This time we will make use of `.SDcols` accepting a function to return columns. In this case, `is.factor` will return the columns which are factors. For more on the **S**ubset of the **D**ata, there is also an [SD Usage vignette](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-sd-usage.html). + +Sometimes, it is also nice to keep track of columns that we transform. That way, even after we convert our columns we would be able to call the specific columns we were updating. 
```{r} -flights[, names(.SD) := lapply(.SD, as.character), .SDcols = is.factor] -str(flights[, .SD, .SDcols = is.character]) +factor_cols <- sapply(flights, is.factor) +flights[, names(.SD) := lapply(.SD, as.character), .SDcols = factor_cols] +str(flights[, ..factor_cols]) ``` +#### {.bs-callout .bs-callout-info} + +* We also could have used `(factor_cols)` on the `LHS` instead of `names(.SD)`. + ## 3. `:=` and `copy()` `:=` modifies the input object by reference. Apart from the features we have discussed already, sometimes we might want to use the update by reference feature for its side effect. And at other times it may not be desirable to modify the original object, in which case we can use `copy()` function, as we will see in a moment. From 3635c3d4c30c7ef56b1e16bad4daef7ee14c27b1 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Mon, 18 Mar 2024 22:56:06 -0400 Subject: [PATCH 36/42] Added test where .SD is used as well as some columns not in .SD. --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8580ca22cb..0f8da1c7ff 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18318,3 +18318,7 @@ test(2247.09, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) test(2247.10, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) test(2247.11, dt[, names(.SD(2)) := lapply(.SD, .I)], error = 'could not find function ".SD"') + +dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) +test(2247.12, dt[, names(.SD) := lapply(.SD, \(x) x + b), .SDcols = "a"], data.table(a = 1:3 + 5:7, b = 5:7, grp = c('a', 'a', 'b'))) + From 5fec7bcb36ecd506f6aa5c6841ebe96badd92d21 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 18 Mar 2024 22:01:10 -0700 Subject: [PATCH 
37/42] Mention count of reactions in issue --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 05a6743e54..e42be7e242 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,7 +18,7 @@ 3. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. -4. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report and @ColeMiller1 for PR. +4. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. ## BUG FIXES From 7ae1ea3606f6a184ff3350076e2c834316cfd394 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 18 Mar 2024 22:20:51 -0700 Subject: [PATCH 38/42] small copy-edit --- vignettes/datatable-reference-semantics.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 2de404788f..b678c390ef 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -263,7 +263,7 @@ head(flights) ```{r} flights[, names(.SD) := lapply(.SD, as.factor), .SDcols = is.character] ``` -Let's clean up again and make our newly made factor columns back to character columns. 
This time we will make use of `.SDcols` accepting a function to return columns. In this case, `is.factor` will return the columns which are factors. For more on the **S**ubset of the **D**ata, there is also an [SD Usage vignette](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-sd-usage.html). +Let's clean up again and convert our newly-made factor columns back into character columns. This time we will make use of `.SDcols` accepting a function to decide which columns to include. In this case, `is.factor()` will return the columns which are factors. For more on the **S**ubset of the **D**ata, there is also an [SD Usage vignette](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-sd-usage.html). Sometimes, it is also nice to keep track of columns that we transform. That way, even after we convert our columns we would be able to call the specific columns we were updating. ```{r} From 2cb48ea2f551b703bc56d520dd11e8fa37767922 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 18 Mar 2024 22:22:28 -0700 Subject: [PATCH 39/42] more specific --- vignettes/datatable-sd-usage.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 97aea0f6c9..e4a117b35e 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -80,7 +80,7 @@ Pitching[ , .SD, .SDcols = c('W', 'L', 'G')] This is just for illustration and was pretty boring. In addition to accepting a character vector, `.SDcols` also accepts: 1. any function such as `is.character` to filter _columns_ -2. the function^{*} `patterns()` to filter by _column names_ +2. the function^{*} `patterns()` to filter _column names_ by regular expression 3. 
integer and logical vectors *see `?patterns` for more details From 5a587e71449bd69fc91a017fec220e06f249648a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 18 Mar 2024 22:24:03 -0700 Subject: [PATCH 40/42] specify LHS/RHS --- vignettes/datatable-sd-usage.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index e4a117b35e..09243c820f 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -113,8 +113,8 @@ head(unique(Teams[[fkt[1L]]])) Note: 1. The `:=` is an assignment operator to update the `data.table` in place without making a copy. See [reference semantics](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reference-semantics.html) for more. -2. `names(.SD)` indicates which columns we are updating - in this case we update the entire `.SD`. -3. `lapply()` loops through each column of the `.SD` and converts the column to a factor. +2. The LHS, `names(.SD)`, indicates which columns we are updating - in this case we update the entire `.SD`. +3. The RHS, `lapply()`, loops through each column of the `.SD` and converts the column to a factor. 4. We use the `.SDcols` to only select columns that have pattern of `teamID`. Again, the `.SDcols` argument is quite flexible; above, we supplied `patterns` but we could have also supplied `fkt` or any `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. Finally, the use of a function to filter columns is very helpful. 
From 212a77472aece55fd12f9ceaa9a3d451fb27295a Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 19 Mar 2024 21:43:13 -0400 Subject: [PATCH 41/42] Simplify implementation to probe for names(.SD) and new test --- R/data.table.R | 8 +------- inst/tests/tests.Rraw | 4 ++++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 45ec4a1924..0b080cf773 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1123,13 +1123,7 @@ replace_dot_alias = function(e) { lhs = as.character(lhs) } else { # i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) - replace_names_sd = function(e, cols){ - if (length(e) == 1L) return(e) - if (e %iscall% "names" && is.name(e2 <- e[[2L]]) && e2 == ".SD") return(cols) - for (i in 2:length(e)) if (!is.null(e[[i]])) e[[i]] = replace_names_sd(e[[i]], cols) - e - } - lhs = eval(replace_names_sd(lhs, sdvars), parent.frame(), parent.frame()) + lhs = eval(lhs, list(.SD = setNames(logical(length(sdvars)), sdvars)), parent.frame()) } } else { # `:=`(c2=1L,c3=2L,...) 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0f8da1c7ff..cfdde24cc6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18322,3 +18322,7 @@ test(2247.11, dt[, names(.SD(2)) := lapply(.SD, .I)], error = 'could not find fu dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) test(2247.12, dt[, names(.SD) := lapply(.SD, \(x) x + b), .SDcols = "a"], data.table(a = 1:3 + 5:7, b = 5:7, grp = c('a', 'a', 'b'))) + +dt = data.table(a = 1L, b = 2L, c = 3L, d = 4L, e = 5L, f = 6L) +test(2247.13, dt[, names(.SD)[1:5] := sum(.SD)], data.table(a = 21L, b = 21L, c = 21L, d = 21L, e = 21L, f = 6L)) + From b91dab558a791598f0abd372973a4edae725fa12 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 19 Mar 2024 22:17:08 -0700 Subject: [PATCH 42/42] fine-tune comment --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 0b080cf773..3d15485091 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1122,7 +1122,7 @@ replace_dot_alias = function(e) { if (is.name(lhs)) { lhs = as.character(lhs) } else { - # i.e lhs is names(.SD) || setdiff(names(.SD), cols) || (cols) + # lhs is e.g. (MyVar) or get("MyVar") or names(.SD) || setdiff(names(.SD), cols) lhs = eval(lhs, list(.SD = setNames(logical(length(sdvars)), sdvars)), parent.frame()) } } else {