diff --git a/DESCRIPTION b/DESCRIPTION index 778024a85e..53f11e9d04 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -89,5 +89,6 @@ Authors@R: c( person("Iago", "Giné-Vázquez", role="ctb"), person("Anirban", "Chetia", role="ctb"), person("Doris", "Amoakohene", role="ctb"), - person("Ivan", "Krylov", role="ctb") + person("Ivan", "Krylov", role="ctb"), + person("Michael","Young", role="ctb") ) diff --git a/NEWS.md b/NEWS.md index 6e5a3a9f14..652135536f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,6 +33,26 @@ rowwiseDT( 3. The data.table-only attribute `$.internal.selfref` is no longer set for data.frames. [#5286](https://github.com/Rdatatable/data.table/issues/5286). Thanks @OfekShilon for the report and fix. +4. Tagging/naming arguments of `c()` in `j=c()` should now more closely follow base R conventions for concatenation of named lists during grouping, [#2311](https://github.com/Rdatatable/data.table/issues/2311). Naming an `lapply(.SD, FUN)` call as an argument of `c()` in `j` will now always cause that tag to get prepended (with a single dot separator) to the resulting column names. Additionally, naming a `list()` call as an argument of `c()` in `j` will now always cause that tag to get prepended to any names specified within the list call. This bug only affected queries with (1) `by=` grouping (2) `getOption("datatable.optimize") >= 1L` and (3) `lapply(.SD, FUN)` in `j`. + + While the names returned by `data.table` when `j=c()` will now mostly follow base R conventions for concatenating lists, note that names which are completely unspecified will still be named positionally, matching the typical behavior in `j` and `data.table()`. according to position in `j` (e.g. `V1`, `V2`). + + Thanks to @franknarf1 for reporting and @myoung3 for the PR. + + ```r + # tag 'mean' prepended to lapply()-named columns + names(mtcars[, c(mean=lapply(.SD,sum)), by="cyl", .SDcols=c("am", "carb")]) + # [1] "cyl" "mean.am" "mean.carb" + + # tag 'mean' is prepended to the first named sublist, 'sum' to the second + names(mtcars[, c(mean=list(a=mean(hp), b=mean(wt)), sum=lapply(.SD, sum)), by="cyl", .SDcols=c("am", "carb")]) + # [1] "cyl" "mean.a" "mean.b" "sum.am" "sum.carb" + + # strict base naming would result in names c("", "b", "c") here + names(mtcars[, c(list(mean(hp), b=mean(wt)), c=list(mean(cyl)))]) + # [1] "V1" "b" "c" + ``` + ## NOTES 1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix. diff --git a/R/data.table.R b/R/data.table.R index 3b58bb5025..20deea3f9f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1697,12 +1697,30 @@ replace_dot_alias = function(e) { deparse_ans = .massageSD(this) funi = funi + 1L # Fix for #985 jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.) - jvnames = c(jvnames, deparse_ans[[2L]]) + jn__ = deparse_ans[[2L]] + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named arguments of c() to column names of .SD + # e.g. c(mean=lapply(.SD, mean)) + jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1)) + } + jvnames = c(jvnames, jn__) } else if (this[[1L]] == "list") { # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen if (length(this) > 1L) { jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) - jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__) + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* + njl__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__) + njl__nonblank = nzchar(names(jl__)) + if (length(jl__) > 1L) { + jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) + } else { + jn__ = names(jsubl)[i_] + } + jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") + } else { + jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__) + } idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) if (any(idx)) jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && @@ -1967,7 +1985,10 @@ replace_dot_alias = function(e) { if (any(ww)) jvnames[ww] = paste0("V",ww) setattr(ans, "names", c(bynames, jvnames)) } else { - setnames(ans,seq_along(bynames),bynames) # TO DO: reinvestigate bynames flowing from dogroups here and simplify + nonbynames = names(ans)[-seq_along(bynames)] #related to 2311. make naming of empty columns names more consistent + ww = which(!nzchar(nonbynames)) + if (length(ww)) nonbynames[ww] = paste0("V", ww) + setattr(ans, "names", c(bynames, nonbynames)) # TO DO: reinvestigate bynames flowing from dogroups here and simplify } if (byjoin && keyby && !bysameorder) { if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... ");flush.console()} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f8bf166e7..460dbf6f8b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -19085,3 +19085,73 @@ test(2282.08, rowwiseDT(A=,B=,1,2,C=,4), error="Header must be the first N argum # evaluate arguments in the correct frame ncols = 1e6 test(2282.09, rowwiseDT(A=,ncols), data.table(A=ncols)) + +# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 + +M <- as.data.table(mtcars) +M[, " " := hp] +M[, "." := hp] + +sdnames <- setdiff(names(M), "cyl") +sdlist <- vector("list", length(sdnames)) +names(sdlist) <- sdnames + +for (opt in c(0, 1, 2)) { + test(2283 + opt/10 + 0.001, options=c(datatable.optimize=opt), + names(M[, c(m=lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(m=sdlist)))) + test(2283 + opt/10 + 0.002, options=c(datatable.optimize=opt), + names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", sdnames)) + test(2283 + opt/10 + 0.003, options=c(datatable.optimize=opt), + names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", names(c(m=sdlist)))) + test(2283 + opt/10 + 0.004, options=c(datatable.optimize=opt), + names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), + c("cyl", "mpg", names(c(mpg=sdlist)))) + test(2283 + opt/10 + 0.005, options=c(datatable.optimize=opt), + names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "V1", sdnames)) + test(2283 + opt/10 + 0.006, options=c(datatable.optimize=opt), + names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), + c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) + test(2283 + opt/10 + 0.007, options=c(datatable.optimize=opt), + names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, sdnames)) + test(2283 + opt/10 + 0.008, options=c(datatable.optimize=opt), + names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(mean=sdlist, sum=sdlist)))) + test(2283 + opt/10 + 0.009, options=c(datatable.optimize=opt), + names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, names(c(sum=sdlist))) ) + test(2283 + opt/10 + 0.010, options=c(datatable.optimize=opt), + names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(" "=sdlist, "."=sdlist)))) + test(2283 + opt/10 + 0.011, options=c(datatable.optimize=opt), + names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(a=0, b=0))), sdnames)) + test(2283 + opt/10 + 0.012, options=c(datatable.optimize=opt), + names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, 0))), sdnames)) + test(2283 + opt/10 + 0.013, options=c(datatable.optimize=opt), + names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) + test(2283 + opt/10 + 0.014, options=c(datatable.optimize=opt), + names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0))), sdnames)) + test(2283 + opt/10 + 0.015, options=c(datatable.optimize=opt), + names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) + test(2283 + opt/10 + 0.016, options=c(datatable.optimize=opt), + names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) + test(2283 + opt/10 + 0.017, options=c(datatable.optimize=opt), + names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am")) + test(2283 + opt/10 + 0.018, options=c(datatable.optimize=opt), + names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am")) + test(2283 + opt/10 + 0.019, options=c(datatable.optimize=opt), + names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "V2", "b", "vs", "am")) +}