diff --git a/NEWS.md b/NEWS.md index 7bfd9a416a..3218b31df8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ 1. Usage of comma-separated character strings representing multiple columns in `data.table()`'s `key=` argument and `[`'s `by=`/`keyby=` arguments is deprecated, [#4357](https://github.com/Rdatatable/data.table/issues/4357). While sometimes convenient, ultimately it introduces inconsistency in implementation that is not worth the benefit to maintain. NB: this hard deprecation is temporary in the development version. Before release, it will soften into the normal data.table deprecation cycle starting from introducing the new behavior with an option, then changing the default for the option with a warning, then upgrading the warning to an error before finally removing the option and the error. +2. `measure()` and `patterns()` no longer allow `cols` argument to be provided by the user, in the context of `.SDcols` or `melt`. Documentation and error messages also now more clearly explain this, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks @UweBlock for the report and to @tdhock for fixing. + ## NEW FEATURES 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. 
diff --git a/R/utils.R b/R/utils.R index a78e5450f7..ecc36e9a43 100644 --- a/R/utils.R +++ b/R/utils.R @@ -132,7 +132,10 @@ eval_with_cols = function(orig_call, all_cols) { }) if (!is.primitive(fun)) { named_call = match.call(fun, orig_call) - if ("cols" %in% names(formals(fun)) && !"cols" %in% names(named_call)) { + if ("cols" %in% names(formals(fun))) { + if ("cols" %in% names(named_call)) { + stopf("user should not provide cols argument to %s, when specifying the columns for melt or .SDcols; in this context, non-standard evaluation is used internally to set cols to all data table column names, so please fix by removing cols argument", as.character(fun_uneval)) + } named_call[["cols"]] = all_cols } named_call[[1L]] = fun diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5f6206c42c..d9746eea59 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -50,6 +50,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { isReallyReal = data.table:::isReallyReal isRealReallyInt = data.table:::isRealReallyInt is_utc = data.table:::is_utc + measure = data.table:::measure # for test 2183.001 melt.data.table = data.table:::melt.data.table # for test 1953.4 null.data.table = data.table:::null.data.table print.data.table = data.table:::print.data.table @@ -12355,7 +12356,7 @@ DTout = data.table( value1 = 1:10, value2 = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j") ) -test(1866.6, melt(DT, measure.vars = patterns("^x", "^y", cols=names(DT))), DTout) +test(1866.6, melt(DT, measure.vars = patterns("^x", "^y")), DTout) # auto fill too few column names (#1625) and auto fill=TRUE when too many column names test(1867.01, fread("A,B\n1,3,5,7\n2,4,6,8\n"), data.table(A=1:2, B=3:4, V3=5:6, V4=7:8), @@ -13999,6 +14000,7 @@ DT = data.table( V9 = c(0.2, -0.1, 1.2, -0.5, 1.4, 1, 0.2, 0.7, 0.4, 1.6), V10 = c(0.8, 0.7, -1.2, -0.9, -0.6, 0.4, -2.3, 2.2, 0.5, -1.4) ) +test(1971.01, DT[ , lapply(.SD, sum), .SDcols = patterns('^V', cols="V7")], error="user 
should not provide cols argument to patterns, when specifying the columns for melt or .SDcols; in this context, non-standard evaluation is used internally to set cols to all data table column names, so please fix by removing cols argument") test(1971.1, DT[ , lapply(.SD, sum), .SDcols = patterns('^V')], data.table(V1=-6.3, V2=-6.5, V3=1.3, V4=3.4, V5=-0.9, V6=-1.6, V7=-2, V8=-0.4, V9=6.1, V10=-1.8)) # multiple pattens --> intersection of patterns @@ -17298,12 +17300,13 @@ test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.fa # new variable_table attribute for measure.vars, PR#4731 for multiple issues measurev = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. test(2183.00001, melt(DT.wide, measure.vars=measurev()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +test(2183.000015, melt(DT.wide, measure.vars=measurev("a2")), error="user should not provide cols argument to measurev, when specifying the columns for melt or .SDcols; in this context, non-standard evaluation is used internally to set cols to all data table column names, so please fix by removing cols argument") measurev = list("foo", "bar")#measurev below should not use this since it is not a function. 
test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging") test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword iris.dt = data.table(datasets::iris) -test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") +test(2183.00020, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL), sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]") test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]") test(2183.00027, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim="bar"), sep=".")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: dim") @@ -17319,8 +17322,10 @@ test(2183.00060, melt(DTid, measure.vars=measurev(list(letter=myfac, value.name= ### Second block testing measure # new variable_table attribute for measure.vars, PR#4731 for multiple issues +test(2183.001, measure(cols=as.integer), error="cols must be a character vector of column names") measure = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but 
user-defined version should be used. test(2183.01, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +test(2183.015, melt(DT.wide, measure.vars=measure(cols="b2")), error="user should not provide cols argument to measure, when specifying the columns for melt or .SDcols; in this context, non-standard evaluation is used internally to set cols to all data table column names, so please fix by removing cols argument") measure = list("foo", "bar")#measure below should not use this since it is not a function. test(2183.02, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging") test(2183.03, melt(DTid, measure.vars=structure(list(a=c(NA,"a2"),b=c("b1","b2")), variable_table=data.table(number=as.complex(1:2)))), error="variable_table does not support column type 'complex' for column 'number'") @@ -17392,7 +17397,7 @@ test(2183.74, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=a test(2183.75, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=NA_character_, pattern="([ab])([12])")), error="multiple.keyword must be a character string") test(2183.76, melt(DTid, measure.vars=measure(letter, number, multiple.keyword="", pattern="([ab])([12])")), error="multiple.keyword must be a character string with nchar>0") test(2183.77, melt(DTid, measure.vars=measure(letter, cols, pattern="([ab])([12])")), error="group names specified in ... 
conflict with measure argument names; please fix by changing group names: [cols]") -test(2183.78, melt(DTid, measure.vars=measure(letter, cols=as.integer, pattern="([ab])([12])")), error="cols must be a character vector of column names") +test(2183.78, melt(DTid, measure.vars=measure(letter, cols=as.integer, pattern="([ab])([12])")), error="user should not provide cols argument to measure, when specifying the columns for melt or .SDcols; in this context, non-standard evaluation is used internally to set cols to all data table column names, so please fix by removing cols argument") test(2183.79, melt(DTid, measure.vars=measure(letter, number, pattern=as.integer)), error="pattern must be character string") test(2183.80, melt(DTid, measure.vars=measure(letter, number, sep=as.integer)), error="sep must be character string") diff --git a/man/measure.Rd b/man/measure.Rd index 73a315e006..d230ac7478 100644 --- a/man/measure.Rd +++ b/man/measure.Rd @@ -5,16 +5,15 @@ \description{ These functions compute an integer vector or list for use as the \code{measure.vars} argument to \code{melt}. - Each measured variable name is converted into several groups that occupy + Either \code{sep} or \code{pattern} argument can be used to specify + a subset of input column/variable names to be measured. + Each measured variable name is converted into one or more groups that occupy different columns in the output melted data. \code{measure} allows specifying group names/conversions in R code (each group and conversion specified as an argument) whereas \code{measurev} allows specifying group names/conversions using data values (each group and conversion specified as a list element). - See - \href{../doc/datatable-reshape.html}{\code{vignette("datatable-reshape")}} - for more info. 
} \usage{ measure(\dots, sep, pattern, cols, multiple.keyword="value.name") @@ -38,7 +37,15 @@ measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name", are considered measure variables.} \item{pattern}{Perl-compatible regex with capture groups to match to \code{cols}. Columns that match the regex are considered measure variables.} - \item{cols}{A character vector of column names.} + \item{cols}{ + A character vector of column names. + When used in the context of + \code{melt(measure.vars=measure())}, the + user should not provide the \code{cols} argument, which is + automatically set to all of the column names of the input data + table, using non-standard evaluation. + To specify the columns to be measured, + use either \code{sep} or \code{pattern} argument.} \item{multiple.keyword}{A string, if used as a group name, then measure returns a list and melt returns multiple value columns (with names defined by the unique values in that @@ -54,6 +61,8 @@ measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name", (two.iris = data.table(datasets::iris)[c(1,150)]) # melt into a single value column. melt(two.iris, measure.vars = measure(part, dim, sep=".")) +# do the same, with a regex pattern. +melt(two.iris, measure.vars = measure(part, dim, pattern="(.*)[.](.*)")) # do the same, programmatically with measurev my.list = list(part=NULL, dim=NULL) melt(two.iris, measure.vars=measurev(my.list, sep=".")) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 6dd74291d5..8ec6202513 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -21,16 +21,16 @@ multiple columns simultaneously. \item{id.vars}{vector of id variables. Can be integer (corresponding id column numbers) or character (id column names) vector. If missing, all non-measure columns will be assigned to it. If integer, must be positive; see Details. } -\item{measure.vars}{Measure variables for \code{melt}ing. Can be missing, vector, list, or pattern-based. 
+\item{measure.vars}{Measure variables for \code{melt}ing. Can be missing, a vector, a list, or a call to a function that computes the measure variables. \itemize{ \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. } \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). } \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. } - \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} syntax; multiple patterns will produce multiple columns. } + \item{ Pattern-based column matching can be achieved by using the result of \code{\link{patterns}} or \code{\link{measure}} functions. More generally, it is possible to call any function that returns a character vector or list of measure variables, and if this function has an argument named \code{cols}, then that argument will be set to \code{names(data)} by non-standard evaluation. } } - For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } + For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements of \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also Examples below. } \item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors. 
If \code{measure.vars} has attribute \code{variable_table} then it must be a data table with nrow = length of \code{measure.vars} vector(s), each row describing the corresponding measured variables(s), (typically created via \code{measure}) and its columns will be output instead of the \code{variable.name} column.} \item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. } \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten @@ -66,10 +66,8 @@ into multiple columns in a single function call efficiently. If a vector in the list contains missing values, or is shorter than the max length of the list elements, then the output will include runs of missing values at the specified position, or at the end. -The function -\code{\link{patterns}} can be used to provide regular expression patterns. When -used along with \code{melt}, if \code{cols} argument is not provided, the -patterns will be matched against \code{names(data)}, for convenience. +The functions \code{\link{patterns}} and \code{\link{measure}} +can be used in \code{measure.vars} to melt all columns matching a regular expression. Attributes are preserved if all \code{value} columns are of the same type. By default, if any of the columns to be melted are of type \code{factor}, it'll diff --git a/man/patterns.Rd b/man/patterns.Rd index cd3d3fd8bb..2181167a53 100644 --- a/man/patterns.Rd +++ b/man/patterns.Rd @@ -19,19 +19,27 @@ patterns( } \arguments{ \item{\dots}{A set of regular expression patterns.} - \item{cols}{A character vector of names to which each pattern is matched.} + \item{cols}{A character vector of column names to match with + patterns. 
When used in the context of + \code{melt(measure.vars=patterns())} or + \code{DT[, .SDcols=patterns()]}, the + user should not provide the \code{cols} argument, which is + automatically set to all of the column names of the input data + table, using non-standard evaluation. + } \item{ignore.case, perl, fixed, useBytes}{Passed to \code{\link{grep}}.} } \seealso{ - \code{\link{melt}}, - \url{https://github.com/Rdatatable/data.table/wiki/Getting-started} + \code{\link{melt}}, \code{\link{.SD}}, + \url{https://github.com/Rdatatable/data.table/wiki/Getting-started}, + \href{https://cloud.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html}{\code{vignette("datatable-reshape")}}, + \href{https://cloud.r-project.org/web/packages/data.table/vignettes/datatable-sd-usage.html}{\code{vignette("datatable-sd-usage")}} } \examples{ DT = data.table(x1 = 1:5, x2 = 6:10, y1 = letters[1:5], y2 = letters[6:10]) # melt all columns that begin with 'x' & 'y', respectively, into separate columns -melt(DT, measure.vars = patterns("^x", "^y", cols=names(DT))) -# when used with melt, 'cols' is implicitly assumed to be names of input -# data.table, if not provided. -melt(DT, measure.vars = patterns("^x", "^y")) +melt(DT, measure.vars = patterns(x="^x", y="^y")) +# summarize all columns that contain x +DT[, lapply(.SD, sum), .SDcols=patterns("x")] } \keyword{data}