From 4ed2f2d717e2dcbda3ded2a94da57265dae23647 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 17 Jul 2018 11:53:22 +0800 Subject: [PATCH 1/6] Closes #1596 -- CJ now auto-names its input --- NEWS.md | 2 ++ R/data.table.R | 31 ++++++++++--------------------- R/setkey.R | 7 +------ R/utils.R | 29 +++++++++++++++++++++++++++++ inst/tests/tests.Rraw | 4 ++-- 5 files changed, 44 insertions(+), 29 deletions(-) diff --git a/NEWS.md b/NEWS.md index ad95f1cb01..a556261cd3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ #### NEW FEATURES +1. `CJ()` now auto-names its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Code that relies on `V1`, `V2`, ... naming will no longer work as expected. Thanks @franknarf1 for the suggestion. + #### BUG FIXES #### NOTES diff --git a/R/data.table.R b/R/data.table.R index 1f70c83d9c..66c8de24d1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -49,31 +49,20 @@ data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str .Call(CcopyNamedInList,x) # to maintain pre-Rv3.1.0 behaviour, for now. See test 548.2. TODO: revist # TODO Something strange with NAMED on components of `...` to investigate. Or, just port data.table() to C. - # fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top. - if (identical(x, list(NULL)) || identical(x, list(list())) || - identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() ) - tt <- as.list(substitute(list(...)))[-1L] # Intention here is that data.table(X,Y) will automatically put X and Y as the column names. For longer expressions, name the arguments to data.table(). But in a call to [.data.table, wrap in list() e.g. DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names - vnames = names(tt) - if (is.null(vnames)) vnames = rep.int("",length(x)) - vnames[is.na(vnames)] = "" - novname = vnames=="" - if (any(!novname)) { - if (any(vnames[!novname] == ".SD")) stop("A column may not be called .SD. That has special meaning.") - } - for (i in which(novname)) { - # if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.)) - if (is.null(ncol(x[[i]]))) { - if ((tmp <- deparse(tt[[i]])[1L]) == make.names(tmp)) - vnames[i] <- tmp - } - } - tt = vnames=="" - if (any(tt)) vnames[tt] = paste0("V", which(tt)) - # so now finally we have good column names. We also will use novname later to know which were explicitly supplied in the call. n <- length(x) if (n < 1L) return( null.data.table() ) + # fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top. + if (identical(x, list(NULL)) || identical(x, list(list())) || + identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() ) + nd = name_dots(...) + vnames = nd$vnames + # We will use novname later to know which were explicitly supplied in the call. + novname = nd$novname if (length(vnames) != n) stop("logical error in vnames") + # cast to a list to facilitate naming of columns with dimension -- + # unlist() at the end automatically handles the need to "push" names + # to accommodate the "new" columns vnames <- as.list.default(vnames) nrows = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. numcols = integer(n) # the ncols of each of the inputs (e.g. if inputs contain matrix or data.table) diff --git a/R/setkey.R b/R/setkey.R index 27d70907d6..2d972388cc 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -390,13 +390,8 @@ CJ <- function(..., sorted = TRUE, unique = FALSE) } setattr(l, "row.names", .set_row_names(length(l[[1L]]))) setattr(l, "class", c("data.table", "data.frame")) + setattr(l, "names", name_dots(...)$vnames) - if (is.null(vnames <- names(l))) - vnames = vector("character", length(l)) - if (any(tt <- vnames == "")) { - vnames[tt] = paste0("V", which(tt)) - setattr(l, "names", vnames) - } l <- alloc.col(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation. if (sorted) { if (!dups) setattr(l, 'sorted', names(l)) diff --git a/R/utils.R b/R/utils.R index 37e802fc92..f37dcb21ce 100644 --- a/R/utils.R +++ b/R/utils.R @@ -71,3 +71,32 @@ vapply_1i <- function (x, fun, ..., use.names = TRUE) { more = function(f) system(paste("more",f)) # nocov (just a dev helper) +# helper used to auto-name columns in data.table(x, y) as x, y (and similar) +# from the original comment in data.table.R: +# Intention here is that data.table(X,Y) will automatically put X and Y +# as the column names. For longer expressions, name the arguments to +# data.table(). But in a call to [.data.table, wrap in list() e.g. +# DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names +# naming of unnested matrices still handled by data.table() +name_dots <- function(...) { + dot_sub <- as.list(substitute(list(...)))[-1L] + vnames = names(dot_sub) + l = list(...) + if (is.null(vnames)) vnames = rep.int("", length(l)) + vnames[is.na(vnames)] = "" + novname = vnames=="" + if (any(idx <- !novname)) { + if (any(vnames[idx] == ".SD")) stop("A column may not be called .SD. That has special meaning.") + } + for (i in which(novname)) { + # if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.)) + if (is.null(ncol(l[[i]]))) { + if ((tmp <- deparse(dot_sub[[i]])[1L]) == make.names(tmp)) + vnames[i] <- tmp + } + } + still_empty = vnames=="" + if (any(still_empty)) vnames[still_empty] = paste0("V", which(still_empty)) + list(vnames = vnames, + novname = novname) +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4384ab81b4..533db9ffba 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2709,7 +2709,7 @@ test(995, DT[CJ(c(5,3), c(5,1), sorted=FALSE)], OUT) # CJ with ordered factor xx <- factor(letters[1:2], ordered=TRUE) yy <- sample(2) -test(996, CJ(xx, yy), setkey(data.table(rep(xx, each=2), rep(base::sort.int(yy), 2)))) +test(996, CJ(xx, yy), setkey(data.table(xx = rep(xx, each=2), yy = rep(base::sort.int(yy), 2)))) # That CJ orders NA consistently with setkey and historically, now it doesn't use setkey. # NA must always come first in data.table throughout, since binary search relies on that internally. @@ -6799,7 +6799,7 @@ test(1524, ans1, ans2) # 'unique =' argument for CJ, #1148 x = c(1, 2, 1) y = c(5, 8, 8, 4) -test(1525, CJ(x, y, unique=TRUE), CJ(c(1,2), c(4,5,8))) +test(1525, CJ(x, y, unique=TRUE), CJ(x = c(1,2), y = c(4,5,8))) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) From 33269cd6c6d1c4f0cc56ee3cf03f37d3f260a25b Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 17 Jul 2018 14:34:16 +0800 Subject: [PATCH 2/6] update to CJ man, and also close #2192 --- man/J.Rd | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/man/J.Rd b/man/J.Rd index d860f502b9..0521e2e96e 100644 --- a/man/J.Rd +++ b/man/J.Rd @@ -16,7 +16,7 @@ CJ(..., sorted = TRUE, unique = FALSE) # DT[CJ(...)] \arguments{ \item{\dots}{ Each argument is a vector. Generally each vector is the same length but if they are not then the usual silent repetition is applied. } - \item{sorted}{ logical. Should the input order be retained?} + \item{sorted}{ logical. Should the input be sorted (ascending order)? If \code{FALSE}, the input order is retained. } \item{unique}{ logical. When \code{TRUE}, only unique values of each vectors are used (automatically). } } \details{ @@ -28,11 +28,11 @@ CJ(..., sorted = TRUE, unique = FALSE) # DT[CJ(...)] } \value{ \itemize{ - \code{J} : the same result as calling list. J is a direct alias for list but results in clearer more readable code. + \code{J} : the same result as calling list. \code{J} is a direct alias for list but results in clearer more readable code. - \code{SJ} : (S)orted (J)oin. The same value as J() but additionally setkey() is called on all the columns in the order they were passed in to SJ. For efficiency, to invoke a binary merge rather than a repeated binary full search for each row of \code{i}. + \code{SJ} : (S)orted (J)oin. The same value as \code{J()} but additionally \code{setkey()} is called on all the columns in the order they were passed in to \code{SJ}. For efficiency, to invoke a binary merge rather than a repeated binary full search for each row of \code{i}. - \code{CJ} : (C)ross (J)oin. A data.table is formed from the cross product of the vectors. For example, 10 ids, and 100 dates, CJ returns a 1000 row table containing all the dates for all the ids. It gains \code{sorted}, which by default is TRUE for backwards compatibility. FALSE retains input order. + \code{CJ} : (C)ross (J)oin. A \code{data.table} is formed from the cross product of the vectors. For example, 10 ids, and 100 dates, \code{CJ} returns a 1000 row table containing all the dates for all the ids. It gains \code{sorted}, which by default is \code{TRUE} for backwards compatibility. \code{FALSE} retains input order. } } \seealso{ \code{\link{data.table}}, \code{\link{test.data.table}} } @@ -50,6 +50,7 @@ CJ(c(5,NA,1), c(1,3,2), sorted=FALSE) # same order as input, unkeyed # use for 'unique=' argument x = c(1,1,2) y = c(4,6,4) +CJ(x, y) # output columns are automatically named 'x' and 'y' CJ(x, y, unique=TRUE) # unique(x) and unique(y) are computed automatically } From a53a7e95ed57f4e2d393a4c67f83a5e652198bd9 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 31 Aug 2018 18:51:12 -0700 Subject: [PATCH 3/6] Simpler name_dots --- R/utils.R | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/R/utils.R b/R/utils.R index d91bb3c4e4..09c0dec0d8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -64,32 +64,25 @@ vapply_1i <- function (x, fun, ..., use.names = TRUE) { more = function(f) system(paste("more",f)) # nocov (just a dev helper) -# helper used to auto-name columns in data.table(x, y) as x, y (and similar) -# from the original comment in data.table.R: -# Intention here is that data.table(X,Y) will automatically put X and Y -# as the column names. For longer expressions, name the arguments to -# data.table(). But in a call to [.data.table, wrap in list() e.g. -# DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names -# naming of unnested matrices still handled by data.table() +# helper used to auto-name columns in data.table(x,y) as c("x","y"), CJ(x,y) and similar +# naming of unnested matrices still handled by data.table() name_dots <- function(...) { dot_sub <- as.list(substitute(list(...)))[-1L] vnames = names(dot_sub) - l = list(...) - if (is.null(vnames)) vnames = rep.int("", length(l)) - vnames[is.na(vnames)] = "" - novname = vnames=="" - if (any(idx <- !novname)) { - if (any(vnames[idx] == ".SD")) stop("A column may not be called .SD. That has special meaning.") + if (is.null(vnames)) { + vnames = rep.int("", length(dot_sub)) + novname = rep.int(TRUE, length(dot_sub)) + } else { + vnames[is.na(vnames)] = "" + if (any(vnames==".SD")) stop("A column may not be called .SD. That has special meaning.") + novname = vnames=="" } for (i in which(novname)) { - # if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.)) - if (is.null(ncol(l[[i]]))) { - if ((tmp <- deparse(dot_sub[[i]])[1L]) == make.names(tmp)) - vnames[i] <- tmp - } + if ((tmp <- deparse(dot_sub[[i]])[1L]) == make.names(tmp)) + vnames[i] = tmp } still_empty = vnames=="" if (any(still_empty)) vnames[still_empty] = paste0("V", which(still_empty)) - list(vnames = vnames, - novname = novname) + list(vnames=vnames, novname=novname) } + From f9028d996fb283a5b8f9f095d4aa7e537e6ea5c4 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 31 Aug 2018 19:28:08 -0700 Subject: [PATCH 4/6] Added datatable.CJ.names option --- NEWS.md | 2 +- R/setkey.R | 8 +++++++- inst/tests/tests.Rraw | 12 ++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index d06e86e669..9056efba5a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,7 +15,7 @@ 5. `fread()` has always accepted system commands (e.g. `fread("grep blah file.txt")`). It now gains explicit `cmd=` too (`fread(cmd="grep blah file.txt")`). If and only if `input=` is a system command, and a variable was used to hold that command (`fread(someCommand)`, not `fread("the command")` which is a literal string constant) or a variable is used to construct it (`fread(paste("grep",variable,"file.txt"))`), a message is now printed suggesting `cmd=`. This is to inform all users that there is a potential security concern if you are i) creating apps, and ii) your app takes input from a public user who could be malicious, and iii) input from the malicious user (such as a filename) is passed by your app to `fread()`, and iv) your app in not running in a protected environment. If all 4 conditions hold then the malicious user could provide a system command instead of a filename which `fread()` would run and that would be a problem. If the app is not running in a protected environment (e.g. app is running as root) then this could do damage or obtain data you did not intend. Public facing apps should be running with limited operating system permission so that any breach from any source is contained. We agree with [Linus Torvald's advice](https://lkml.org/lkml/2017/11/21/356) on this which boils down to: "when addressing security concerns the first step is do no harm, just inform". If you aren't creating apps or apis that could have a malicious user then there is no risk but we can't distinguish you so we have to inform everyone. Messages can be suppressed with `suppressMessages()` or you can change to use `cmd=` at your leisure. Passing system commands to `fread()` continues to be recommended and encouraged and is widely used; e.g. via the techniques gathered together in the book [Data Science at the Command Line](https://www.datascienceatthecommandline.com/). A `warning()` is too strong because best-practice for production systems is to set `options(warn=2)` to tolerate no warnings. Such production systems have no user input and so there is no security risk; we don't want to do harm by breaking production systems via a `warning()` which gets turned into an error by `options(warn=2)`. Now that we have informed all users, we request feedback. There are 3 options for future releases: i) remove the message, ii) leave the message in place, iii) upgrade the message to warning and then eventually error. The default choice is the middle one: leave the message in place. -6. `CJ()` now auto-names its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Code that relies on `V1`, `V2`, ... naming will no longer work as expected. Thanks @franknarf1 for the suggestion. +6. New `options(datatable.CJ.names=TRUE)` changes `CJ()` to auto-name its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Thanks @franknarf1 for the suggestion. Current default is `FALSE`; i.e. no change. The option's default will be changed to `TRUE` in v1.12.0 and then eventually the option will be removed. Any code that depends on `CJ(x,y)$V1` will need to be changed to `CJ(x,y)$x` and is more akin to a bug fix due to the inconsistency with `data.table()`. #### BUG FIXES diff --git a/R/setkey.R b/R/setkey.R index 4ae7cacefc..4a5c19648a 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -396,7 +396,13 @@ CJ <- function(..., sorted = TRUE, unique = FALSE) } setattr(l, "row.names", .set_row_names(length(l[[1L]]))) setattr(l, "class", c("data.table", "data.frame")) - setattr(l, "names", name_dots(...)$vnames) + if (getOption("datatable.CJ.names", FALSE)) { # option added v1.11.6. Change to TRUE in v1.12.0 + vnames = name_dots(...)$vnames + } else { + if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l))) + else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt)) + } + setattr(l, "names", vnames) l <- alloc.col(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation. if (sorted) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7513e862e6..c561c7dae5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2712,7 +2712,11 @@ test(995, DT[CJ(c(5,3), c(5,1), sorted=FALSE)], OUT) # CJ with ordered factor xx <- factor(letters[1:2], ordered=TRUE) yy <- sample(2) -test(996, CJ(xx, yy), setkey(data.table(xx = rep(xx, each=2), yy = rep(base::sort.int(yy), 2)))) +old = options(datatable.CJ.names=FALSE) +test(996.1, CJ(xx, yy), setkey(data.table(rep(xx, each=2), rep(base::sort.int(yy), 2)))) +options(datatable.CJ.names=TRUE) +test(996.2, CJ(xx, yy), setkey(data.table(xx = rep(xx, each=2), yy = rep(base::sort.int(yy), 2)))) +options(old) # That CJ orders NA consistently with setkey and historically, now it doesn't use setkey. # NA must always come first in data.table throughout, since binary search relies on that internally. @@ -6802,7 +6806,11 @@ test(1524, ans1, ans2) # 'unique =' argument for CJ, #1148 x = c(1, 2, 1) y = c(5, 8, 8, 4) -test(1525, CJ(x, y, unique=TRUE), CJ(x = c(1,2), y = c(4,5,8))) +old = options(datatable.CJ.names=FALSE) +test(1525.1, CJ(x, y, unique=TRUE), CJ(V1=c(1,2), V2=c(4,5,8))) +options(datatable.CJ.names=TRUE) +test(1525.1, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) +options(old) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) From 176ff5dc13bf13e54fd4230613aa1ddb7ccca359 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 31 Aug 2018 19:30:20 -0700 Subject: [PATCH 5/6] Fixed test number --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c561c7dae5..72c4ad6b48 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6809,7 +6809,7 @@ y = c(5, 8, 8, 4) old = options(datatable.CJ.names=FALSE) test(1525.1, CJ(x, y, unique=TRUE), CJ(V1=c(1,2), V2=c(4,5,8))) options(datatable.CJ.names=TRUE) -test(1525.1, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) +test(1525.2, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) options(old) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 From 8583e5b079cdba30cf0b625d750348be4eb82b2e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 31 Aug 2018 19:50:27 -0700 Subject: [PATCH 6/6] coverage --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 0b307b9c6f..6a7a6e0dd4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -59,7 +59,7 @@ data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str vnames = nd$vnames # We will use novname later to know which were explicitly supplied in the call. novname = nd$novname - if (length(vnames) != n) stop("logical error in vnames") + if (length(vnames) != n) stop("logical error in vnames") # nocov # cast to a list to facilitate naming of columns with dimension -- # unlist() at the end automatically handles the need to "push" names # to accommodate the "new" columns diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 72c4ad6b48..4a549a4e57 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12153,6 +12153,9 @@ test(1938.2, fwrite(list(1:3), file=f<-tempfile()), NULL) # just adding file= w test(1939.3, readLines(f), as.character(1:3)) unlink(f) +test(1940, data.table(A=1:3, .SD=4:6), error=".SD.*has special meaning") + + ################################### # Add new tests above this line # ###################################