From 1561017c75431662cb53fe1451c5782f4f7cc241 Mon Sep 17 00:00:00 2001 From: mczekanski1 Date: Sat, 6 Mar 2021 19:14:35 -0500 Subject: [PATCH 01/32] Added notin operator to resolve #4152 --- NAMESPACE | 2 +- R/notin.R | 6 ++++++ inst/tests/tests.Rraw | 20 ++++++++++++-------- man/notin.Rd | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 R/notin.R create mode 100644 man/notin.Rd diff --git a/NAMESPACE b/NAMESPACE index 57271aa04d..b6065ce34b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,7 @@ exportClasses(data.table, IDate, ITime) export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) -export(last,first,like,"%like%","%ilike%","%flike%",between,"%between%",inrange,"%inrange%") +export(last,first,like,"%like%","%ilike%","%flike%",between,"%between%",inrange,"%inrange%", notin, "%notin%") export(timetaken) export(truelength, setalloccol, alloc.col, ":=") export(setattr, setnames, setcolorder, set, setDT, setDF) diff --git a/R/notin.R b/R/notin.R new file mode 100644 index 0000000000..e0f3a1b008 --- /dev/null +++ b/R/notin.R @@ -0,0 +1,6 @@ +# Intended to be used to create %notin% operator +notin = function(example, elements) { + return(!match(example, elements, nomatch = 0)) +} + +"%notin%" = function(example, elements) notin(example, elements) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c5910f5c81..16f753f7ec 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4195,7 +4195,7 @@ setNumericRounding(old_rounding) DT = data.table(id=INT(1,2,1), val1=3:1, val2=3:1, val3=list(2:3,4:6,7:10)) # 5380 test(1199.1, DT[, sum(.SD), by=id, .SDcols=2:3], data.table(id=1:2, V1=INT(8,4))) #875 made the .SD case work -test(1199.2, DT[, sum(.SD), by=id], error="data.*frame.*numeric") # this is R's error message so use flexible string pattern to insulate from minor changes 
in R, #4769 +test(1199.2, DT[, sum(.SD), by=id], error="data.*frame.*numeric") # this is R's error message so use flexible string pattern to insulate from minor changes in R, #4769 test(1199.3, DT[, sum(val3), by=id], error="Type 'list' not supported by GForce sum [(]gsum[)]. Either.*or turn off") # Selection of columns, copy column to maintain the same as R <= 3.0.2, in Rdevel, for now @@ -10442,7 +10442,7 @@ test(1728.12, DT[order(x,na.last=NA)], DT[2]) # was randomly wrong if (test_longdouble) { #3258 old = options(datatable.verbose=FALSE) # capture.output() exact tests must not be polluted with verbosity - + test(1729.01, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))), output="V1,V2\n1,10") test(1729.02, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))), @@ -10522,8 +10522,8 @@ if (test_longdouble) { #3258 # 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 test(1729.12, typeof(DT[[1L]]), "double") test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) - - options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on + + options(old) # restore the previous datatable.verbose value, for example for the CRAN_Release test with verbose on } if (test_bit64) { @@ -10846,7 +10846,7 @@ if (TZnotUTC) { # from v1.13.0 these tests work when running under non-UTC because they compare to as.POSIXct which reads these unmarked datetime in local # the new tests 2150.* cover more cases # from v1.14.0, the tz="" is needed - test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character"), tz=""), + test(1743.25, fread("a,b,c\n2015-06-01 11:00:00,1,ae", colClasses=c("POSIXct","integer","character"), tz=""), data.table(a=as.POSIXct("2015-06-01 11:00:00"),b=1L,c="ae")) test(1743.26, fread("a,b,c,d,e,f,g,h\n1,k,2015-06-01 11:00:00,a,1.5,M,9,0", 
colClasses=list(POSIXct="c", character="b"), drop=c("a","b"), logical01=TRUE, tz=""), ans<-data.table(c=as.POSIXct("2015-06-01 11:00:00"), d="a", e=1.5, f="M", g=9L, h=FALSE)) @@ -17143,7 +17143,7 @@ test(2153.2, DT[, .(list(.GRP)), by=x], data.table(x=1:2, V1=as.list(1:2))) test(2153.3, ans<-DT[, .(list(.NGRP)), by=x], data.table(x=1:2, V1=list(2L,2L))) test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't change group to group so the same object can be referenced many times unlike .N and .GRP test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result - data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) + data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) # warning message segfault when no column names present, #4644 test(2154.1, fread("0.0\n", colClasses="integer"), data.table(V1=0.0), @@ -17161,7 +17161,7 @@ for (i in 0:4) test(2155+i/10, # dogroups.c eval(j) could create list columns containing altrep references to the specials, #4759 # thanks to revdep testing of 1.13.2 where package tstools revealed this via ts() creating ALTREP, #4758 -# the attr(value,"class")<-"newclass" lines mimics a line at the end of stats::ts(). When the +# the attr(value,"class")<-"newclass" lines mimics a line at the end of stats::ts(). When the # length(value)>=64, R creates an ALTREP REF wrapper. Which dogroups.c now catches. # Hence this test needs to be at least 128 rows, 2 groups of 64 each. 
DT = data.table(series=c("ts1","ts2"), value=rnorm(128)) @@ -17186,7 +17186,7 @@ test(2158.1, DT[, .(value = list(value)), index], DT = data.table(value=as.list(1:6), index=rep(1:2, each=3)) test(2158.2, DT[, by="index", list(value=list(value))], data.table(index=1:2, value=list(as.list(1:3), as.list(4:6)))) - + # type consistency of empty input to as.matrix.data.table, #4762 DT = data.table(x = 1) test(2159.01, typeof(as.matrix(DT)), "double") @@ -17263,3 +17263,7 @@ if (identical(x, enc2native(x))) { # fintersect now preserves order of first argument like intersect, #4716 test(2163, fintersect(data.table(x=c("b", "c", "a")), data.table(x=c("a","c")))$x, c("c", "a")) + + +# Test new feature %notin%, #4152 +test(2164, 11 %notin% 1:10, TRUE) diff --git a/man/notin.Rd b/man/notin.Rd new file mode 100644 index 0000000000..ba082d5ace --- /dev/null +++ b/man/notin.Rd @@ -0,0 +1,36 @@ +\name{notin} +\alias{notin} +\alias{\%notin\%} + +\title{ +Convenience for checking if an example is in a set of elements +} + +\description{ +Intended to behave opposite to \code{\link[=base]{in}} +} + +\usage{ +notin(example, elements) +example \%notin\% elements +} + +\arguments{ + \item{example}{ vector or \code{NULL}: value to be matched } + \item{elements}{ vector or \code{NULL}: values to check for a match } +} + +\details{ + Internally, \code{\%notin\%} is a wrapper around \code{\link[=base]{match}}, much like \code{\%in\%}. +} + +\value{ + Logical vector, \code{TRUE} indicating whether each \code{example} was found in \code{elements}. +} + +\seealso{ \code{\link[base]{match}} } + +\examples{ + 11 \%notin\% 1:10 +} + From e23fad8851005eeef01c1f4d5c68b8f878d8dcca Mon Sep 17 00:00:00 2001 From: mczekanski1 Date: Sat, 6 Mar 2021 19:35:02 -0500 Subject: [PATCH 02/32] update NEWS to add %notin% --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index a51de94eb6..f17fd6c820 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## NEW FEATURES +1. 
%notin% added to compute opposite of %in%, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. + ## BUG FIXES ## NOTES From 212b0b2bc2fbfeefe13f2dcbdb757c605635cbf2 Mon Sep 17 00:00:00 2001 From: mczekanski1 Date: Sun, 7 Mar 2021 09:25:15 -0500 Subject: [PATCH 03/32] include branching for is.character in response to PR comments --- R/notin.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/notin.R b/R/notin.R index e0f3a1b008..bdd609b932 100644 --- a/R/notin.R +++ b/R/notin.R @@ -1,6 +1,10 @@ # Intended to be used to create %notin% operator notin = function(example, elements) { - return(!match(example, elements, nomatch = 0)) + if (is.character(example)) { + return(!chmatch(example, elements, nomatch = 0)) + } else { + return(!match(example, elements, nomatch = 0)) + } } "%notin%" = function(example, elements) notin(example, elements) From 5c041c57cb36b60e65886f9a9ecfb631fe4aa4ed Mon Sep 17 00:00:00 2001 From: mczekanski1 Date: Sun, 14 Mar 2021 23:08:55 -0400 Subject: [PATCH 04/32] implement negation of chin in chmatchMain and remove notin function --- NAMESPACE | 2 +- R/data.table.R | 2 +- R/notin.R | 8 +++----- inst/tests/tests.Rraw | 5 ++++- man/notin.Rd | 6 ++---- src/assign.c | 4 ++-- src/chmatch.c | 37 +++++++++++++++++++++++++------------ src/data.table.h | 2 +- src/subset.c | 2 +- 9 files changed, 40 insertions(+), 28 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b6065ce34b..fdfd765396 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,7 @@ exportClasses(data.table, IDate, ITime) export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy) export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) -export(last,first,like,"%like%","%ilike%","%flike%",between,"%between%",inrange,"%inrange%", notin, "%notin%") 
+export(last,first,like,"%like%","%ilike%","%flike%",between,"%between%",inrange,"%inrange%","%notin%") export(timetaken) export(truelength, setalloccol, alloc.col, ":=") export(setattr, setnames, setcolorder, set, setDT, setDF) diff --git a/R/data.table.R b/R/data.table.R index 2b010db77a..638d2043d4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2585,7 +2585,7 @@ chmatchdup = function(x, table, nomatch=NA_integer_) .Call(Cchmatchdup, x, table, as.integer(nomatch[1L])) "%chin%" = function(x, table) - .Call(Cchin, x, table) # TO DO if table has 'ul' then match to that + .Call(Cchin, x, table, FALSE) # TO DO if table has 'ul' then match to that chorder = function(x) { o = forderv(x, sort=TRUE, retGrp=FALSE) diff --git a/R/notin.R b/R/notin.R index bdd609b932..688979fadd 100644 --- a/R/notin.R +++ b/R/notin.R @@ -1,10 +1,8 @@ # Intended to be used to create %notin% operator -notin = function(example, elements) { +"%notin%" = function(example, elements) { if (is.character(example)) { - return(!chmatch(example, elements, nomatch = 0)) + return(.Call(Cchin, example, elements, TRUE)) } else { return(!match(example, elements, nomatch = 0)) } -} - -"%notin%" = function(example, elements) notin(example, elements) +} \ No newline at end of file diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 16f753f7ec..db9e0e2b09 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17266,4 +17266,7 @@ test(2163, fintersect(data.table(x=c("b", "c", "a")), data.table(x=c("a","c")))$ # Test new feature %notin%, #4152 -test(2164, 11 %notin% 1:10, TRUE) +test(2164.1, 11 %notin% 1:10, TRUE) +test(2164.2, "a" %notin% c(), TRUE) +test(2164.3, "a" %notin% c("a", "b", "c"), FALSE) +test(2164.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) \ No newline at end of file diff --git a/man/notin.Rd b/man/notin.Rd index ba082d5ace..d28e4b6595 100644 --- a/man/notin.Rd +++ b/man/notin.Rd @@ -1,5 +1,4 @@ \name{notin} -\alias{notin} \alias{\%notin\%} \title{ @@ -11,7 +10,6 @@ 
Intended to behave opposite to \code{\link[=base]{in}} } \usage{ -notin(example, elements) example \%notin\% elements } @@ -21,11 +19,11 @@ example \%notin\% elements } \details{ - Internally, \code{\%notin\%} is a wrapper around \code{\link[=base]{match}}, much like \code{\%in\%}. + Internally, \code{\%notin\%} is a wrapper around \code{\link[=base]{match}} and \code{\link[=data.table]{chmatch}}. } \value{ - Logical vector, \code{TRUE} indicating whether each \code{example} was found in \code{elements}. + Logical vector, \code{TRUE} indicating whether each \code{example} was not found in \code{elements}. } \seealso{ \code{\link[base]{match}} } diff --git a/src/assign.c b/src/assign.c index 5c0b808707..f39def1981 100644 --- a/src/assign.c +++ b/src/assign.c @@ -525,7 +525,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) if (length(key)) { // if assigning to at least one key column, the key is truncated to one position before the first changed column. //any() and subsetVector() don't seem to be exposed by R API at C level, so this is done here long hand. 
- PROTECT(tmp = chin(key, assignedNames)); protecti++; + PROTECT(tmp = chin(key, assignedNames, false)); protecti++; newKeyLength = xlength(key); for (i=0;i=0; + } + } else { + for (int i=0; i Date: Mon, 15 Mar 2021 00:32:27 -0700 Subject: [PATCH 05/32] tidy --- R/notin.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/notin.R b/R/notin.R index 688979fadd..4ad5a8514b 100644 --- a/R/notin.R +++ b/R/notin.R @@ -1,8 +1,7 @@ -# Intended to be used to create %notin% operator "%notin%" = function(example, elements) { if (is.character(example)) { return(.Call(Cchin, example, elements, TRUE)) } else { return(!match(example, elements, nomatch = 0)) } -} \ No newline at end of file +} From 7c60aff628179500a2d0cf4c12708419c69a8fab Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Mar 2021 00:33:32 -0700 Subject: [PATCH 06/32] terminal newline --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index db9e0e2b09..ece16bd5bc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17269,4 +17269,4 @@ test(2163, fintersect(data.table(x=c("b", "c", "a")), data.table(x=c("a","c")))$ test(2164.1, 11 %notin% 1:10, TRUE) test(2164.2, "a" %notin% c(), TRUE) test(2164.3, "a" %notin% c("a", "b", "c"), FALSE) -test(2164.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) \ No newline at end of file +test(2164.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) From 99e7ad3694234da7b684df16c268fb892eb02a8a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Mar 2021 00:39:52 -0700 Subject: [PATCH 07/32] tighten & emphasize wording Somewhat knotty here -- `TRUE` means "no" and `FALSE` means "yes", in a way, so want to tread carefully --- man/notin.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/notin.Rd b/man/notin.Rd index d28e4b6595..27356e780a 100644 --- a/man/notin.Rd +++ b/man/notin.Rd @@ -23,7 +23,7 @@ example \%notin\% elements } 
\value{ - Logical vector, \code{TRUE} indicating whether each \code{example} was not found in \code{elements}. + Logical vector, \code{TRUE} for each element of \code{example} \emph{absent} from \code{elements}, and \code{FALSE} for each element of \code{example} \emph{present} in \code{elements}. } \seealso{ \code{\link[base]{match}} } From 2d622849a05f1f3433a2a00a161717f0e8e77722 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Mar 2021 00:41:48 -0700 Subject: [PATCH 08/32] whitespace --- src/chmatch.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/chmatch.c b/src/chmatch.c index 59238eaf3f..75e45924de 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -21,9 +21,9 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch } // negate inputs if needed - int chinNoMatch = negate?1:0; - int match = negate?0:1; - nomatch = negate?1:nomatch; + int chinNoMatch = negate ? 1 : 0; + int match = negate ? 0 : 1; + nomatch = negate ? 
1 : nomatch; // allocations up front before savetl starts in case allocs fail int nprotect=0; From f652847fbde3d4eb7a110d77b07a39d37a7956ce Mon Sep 17 00:00:00 2001 From: mczekanski1 Date: Mon, 15 Mar 2021 19:16:15 -0400 Subject: [PATCH 09/32] change parameter names, update documentation, and add tests for edge cases --- R/notin.R | 6 +++--- inst/tests/tests.Rraw | 3 +++ man/notin.Rd | 21 ++++++++++----------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/R/notin.R b/R/notin.R index 4ad5a8514b..ae9224571b 100644 --- a/R/notin.R +++ b/R/notin.R @@ -1,7 +1,7 @@ -"%notin%" = function(example, elements) { +"%notin%" = function(x, table) { if (is.character(example)) { - return(.Call(Cchin, example, elements, TRUE)) + return(.Call(Cchin, x, table, TRUE)) } else { - return(!match(example, elements, nomatch = 0)) + return(!match(x, table, nomatch = 0)) } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b41fb0d478..8226d46662 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17270,3 +17270,6 @@ test(2164.1, 11 %notin% 1:10, TRUE) test(2164.2, "a" %notin% c(), TRUE) test(2164.3, "a" %notin% c("a", "b", "c"), FALSE) test(2164.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) +test(2164.5, "a" %notin% character(), TRUE) +test(2164.6, "a" %notin% integer(), TRUE) +test(2164.7, "a" %notin% NULL, TRUE) diff --git a/man/notin.Rd b/man/notin.Rd index 27356e780a..787f3fd853 100644 --- a/man/notin.Rd +++ b/man/notin.Rd @@ -2,33 +2,32 @@ \alias{\%notin\%} \title{ -Convenience for checking if an example is in a set of elements +Convenience for checking if an example is not in a set of elements } \description{ -Intended to behave opposite to \code{\link[=base]{in}} +Check whether an object is absent from a table, i.e., the logical inverse of \code{\link[=base]{in}}. 
} \usage{ -example \%notin\% elements +x \%notin\% table } \arguments{ - \item{example}{ vector or \code{NULL}: value to be matched } - \item{elements}{ vector or \code{NULL}: values to check for a match } + \item{x}{ vector or \code{NULL}: value to be matched } + \item{table}{ vector or \code{NULL}: values to check for a match } } -\details{ - Internally, \code{\%notin\%} is a wrapper around \code{\link[=base]{match}} and \code{\link[=data.table]{chmatch}}. -} \value{ - Logical vector, \code{TRUE} for each element of \code{example} \emph{absent} from \code{elements}, and \code{FALSE} for each element of \code{example} \emph{present} in \code{elements}. + Logical vector, \code{TRUE} for each element of \code{x} \emph{absent} from \code{table}, and \code{FALSE} for each element of \code{x} \emph{present} in \code{table}. } -\seealso{ \code{\link[base]{match}} } +\seealso{ \code{\link[base]{match}}, \code{\link[data.table]{chmatch}} } + \examples{ - 11 \%notin\% 1:10 + 11 \%notin\% 1:10 # TRUE + "a" \%notin\% c("a", "b") # FALSE } From ca4b779a39776cf17ccf7a3144cdc981df49da0a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 2 May 2021 01:54:56 -0700 Subject: [PATCH 10/32] grammar --- man/notin.Rd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/man/notin.Rd b/man/notin.Rd index 787f3fd853..d84bb2024d 100644 --- a/man/notin.Rd +++ b/man/notin.Rd @@ -2,7 +2,7 @@ \alias{\%notin\%} \title{ -Convenience for checking if an example is not in a set of elements +Convenience operator for checking if an example is not in a set of elements } \description{ @@ -14,8 +14,8 @@ x \%notin\% table } \arguments{ - \item{x}{ vector or \code{NULL}: value to be matched } - \item{table}{ vector or \code{NULL}: values to check for a match } + \item{x}{ Vector or \code{NULL}: the values to be matched. } + \item{table}{ Vector or \code{NULL}: the values to be matched against. 
} } From 37629481b57cd1d873dc3c0e10abdfd4ac0e63b9 Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 8 May 2021 14:47:56 -0400 Subject: [PATCH 11/32] add tests for NA --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 788aab0364..f3adef959d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17281,4 +17281,6 @@ test(2165.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) test(2165.5, "a" %notin% character(), TRUE) test(2165.6, "a" %notin% integer(), TRUE) test(2165.7, "a" %notin% NULL, TRUE) +test(2165.8, NA %notin% 1:5, TRUE) +test(2165.9, NA %notin% c(1:5, NA), FALSE) From 48faf69d9fb619b97f08b0ed3c20b597a3a84831 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 20 Jun 2021 18:58:57 -0400 Subject: [PATCH 12/32] merge with recent data.table changes --- .Rbuildignore | 2 + .dev/CRAN_Release.cmd | 24 +- .dev/revdep.R | 13 +- .gitattributes | 2 + .gitlab-ci.yml | 22 +- DESCRIPTION | 7 +- NAMESPACE | 1 + NEWS.md | 146 +++- R/IDateTime.R | 16 +- R/as.data.table.R | 19 +- R/between.R | 12 +- R/bmerge.R | 52 +- R/cedta.R | 6 +- R/data.table.R | 323 +++++---- R/devel.R | 8 +- R/duplicated.R | 16 +- R/fcast.R | 2 +- R/fmelt.R | 177 ++++- R/foverlaps.R | 4 +- R/frank.R | 7 +- R/fread.R | 56 +- R/fwrite.R | 13 +- R/groupingsets.R | 8 +- R/last.R | 28 +- R/like.R | 5 +- R/merge.R | 32 +- R/onAttach.R | 11 +- R/onLoad.R | 19 +- R/print.data.table.R | 39 +- R/setkey.R | 14 +- R/setops.R | 22 +- R/tables.R | 4 +- R/test.data.table.R | 74 +- R/utils.R | 57 +- R/xts.R | 6 +- _pkgdown.yml | 4 +- inst/include/datatableAPI.h | 5 +- inst/tests/benchmark.Rraw | 4 +- inst/tests/other.Rraw | 6 +- inst/tests/programming.Rraw | 600 ++++++++++++++++ inst/tests/tests.Rraw | 727 ++++++++++++++++---- man/address.Rd | 9 +- man/assign.Rd | 4 +- man/cdt.Rd | 17 +- man/copy.Rd | 10 +- man/data.table.Rd | 14 +- man/dcast.data.table.Rd | 36 +- man/deprecated.Rd | 3 + man/fcase.Rd | 2 +- man/fifelse.Rd | 4 +- 
man/fread.Rd | 3 +- man/froll.Rd | 82 +-- man/fwrite.Rd | 7 +- man/measure.Rd | 92 +++ man/melt.data.table.Rd | 75 +- man/openmp-utils.Rd | 5 +- man/shouldPrint.Rd | 4 +- man/special-symbols.Rd | 8 +- man/substitute2.Rd | 77 +++ man/test.data.table.Rd | 5 + po/R-data.table.pot | 330 +++++++-- po/R-zh_CN.po | 554 ++++++++++++--- po/zh_CN.po | 10 +- src/assign.c | 101 ++- src/chmatch.c | 9 +- src/data.table.h | 7 +- src/dogroups.c | 16 +- src/fastmean.c | 18 +- src/fcast.c | 14 +- src/fifelse.c | 211 +++--- src/fmelt.c | 364 ++++++---- src/forder.c | 6 +- src/frank.c | 70 +- src/fread.c | 45 +- src/fread.h | 2 +- src/freadR.c | 12 +- src/froll.c | 12 +- src/fsort.c | 10 +- src/fwriteR.c | 17 +- src/gsumm.c | 292 ++++---- src/ijoin.c | 224 +++--- src/init.c | 11 +- src/inrange.c | 15 +- src/nqrecreateindices.c | 2 +- src/openmp-utils.c | 2 +- src/programming.c | 32 + src/rbindlist.c | 2 +- src/snprintf.c | 30 +- src/subset.c | 2 +- src/utils.c | 8 +- vignettes/Makefile | 7 - vignettes/datatable-faq.Rmd | 17 +- vignettes/datatable-intro.Rmd | 2 - vignettes/datatable-keys-fast-subset.Rmd | 6 +- vignettes/datatable-programming.Rmd | 420 +++++++++++ vignettes/datatable-reference-semantics.Rmd | 14 +- vignettes/datatable-reshape.Rmd | 95 ++- vignettes/datatable-sd-usage.Rmd | 8 +- 98 files changed, 4549 insertions(+), 1498 deletions(-) create mode 100644 inst/tests/programming.Rraw create mode 100644 man/measure.Rd create mode 100644 man/substitute2.Rd create mode 100644 src/programming.c delete mode 100644 vignettes/Makefile create mode 100644 vignettes/datatable-programming.Rmd diff --git a/.Rbuildignore b/.Rbuildignore index ad51ae2da7..9a939aae81 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ +.dir-locals.el ^\.Rprofile$ ^data\.table_.*\.tar\.gz$ ^vignettes/plots/figures$ @@ -31,6 +32,7 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^\.idea$ +^\.libs$ ^.*\.dll$ diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index a2db3058b3..1dfec0a02a 100644 --- 
a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -154,14 +154,24 @@ grep -n "[^A-Za-z0-9]F[^A-Za-z0-9]" ./inst/tests/tests.Rraw grep -Enr "^[^#]*(?:\[|==|>|<|>=|<=|,|\(|\+)\s*[-]?[0-9]+[^0-9L:.e]" R | grep -Ev "stop|warning|tolerance" # Never use ifelse. fifelse for vectors when necessary (nothing yet) - grep -Enr "\bifelse" R +grep -Enr "\bifelse" R + +# use substr() instead of substring(), #4447 +grep -Fnr "substring" R # No system.time in main tests.Rraw. Timings should be in benchmark.Rraw -grep -n "system[.]time" ./inst/tests/tests.Rraw +grep -Fn "system.time" ./inst/tests/*.Rraw | grep -Fv "benchmark.Rraw" | grep -Fv "this system.time usage ok" + +# No tryCatch in *.Rraw -- tryCatch should be handled only in test() itself to avoid silently missed warnings/errors/output +grep -Fn "tryCatch" ./inst/tests/*.Rraw # All % in *.Rd should be escaped otherwise text gets silently chopped grep -n "[^\]%" ./man/*.Rd +# if (a & b) is either invalid or inefficient (ditto for replace & with |); +# if(any(a [&|] b)) is appropriate b/c of collapsing the logical vector to scalar +grep -nr "^[^#]*if[^&#]*[^&#\"][&][^&]" R | grep -Ev "if\s*[(](?:any|all)" + # seal leak potential where two unprotected API calls are passed to the same # function call, usually involving install() or mkChar() # Greppable thanks to single lines and wide screens @@ -196,6 +206,10 @@ grep allocVector *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAtt grep coerceVector *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAttrib | grep -v return grep asCharacter *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAttrib | grep -v return +# Enforce local scope for loop index (`for (int i=0; ...)` instead of `int i; for (i=0; ...)`) +# exceptions are tagged with #loop_counter_not_local_scope_ok +grep -En "for\s*[(]\s*[a-zA-Z0-9_]+\s*=" src/*.c | grep -Fv "#loop_counter_not_local_scope_ok" + cd .. 
R cc(test=TRUE, clean=TRUE, CC="gcc-10") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html @@ -243,6 +257,11 @@ require(data.table) test.data.table(script="other.Rraw") test.data.table(script="*.Rraw") test.data.table(verbose=TRUE) # since main.R no longer tests verbose mode + +# check example() works on every exported function, with these sticter options too, and also that all help pages have examples +options(warn=2, warnPartialMatchArgs=TRUE, warnPartialMatchAttr=TRUE, warnPartialMatchDollar=TRUE) +invisible(lapply(objects(pos="package:data.table"), example, character.only=TRUE, echo=FALSE, ask=FALSE)) + gctorture2(step=50) system.time(test.data.table(script="*.Rraw")) # apx 8h = froll 3h + nafill 1m + main 5h @@ -530,6 +549,7 @@ sudo apt-get -y install libquantlib0-dev # for RQuantLib sudo apt-get -y install cargo # for gifski, a suggest of nasoi sudo apt-get -y install libgit2-dev # for gert sudo apt-get -y install cmake # for symengine for RxODE +sudo apt-get -y install libxslt1-dev # for xslt sudo R CMD javareconf # ENDIF diff --git a/.dev/revdep.R b/.dev/revdep.R index 49aa6e06f9..38c5a93a66 100644 --- a/.dev/revdep.R +++ b/.dev/revdep.R @@ -9,7 +9,7 @@ Sys.unsetenv("R_PROFILE_USER") # options copied from .dev/.Rprofile that aren't run due to the way this script is started via a profile options(help_type="html") -options(error=quote(dump.frames())) +options(error=quote(utils::dump.frames())) options(width=200) # for cran() output not to wrap # Check that env variables have been set correctly: @@ -36,10 +36,12 @@ stopifnot(identical(Sys.getenv("_R_CHECK_FORCE_SUGGESTS_"),"true")) # e.g. 
https://github.com/reimandlab/ActivePathways/issues/14 cflags = system("grep \"^[^#]*CFLAGS\" ~/.R/Makevars", intern=TRUE) -cat("~/.R/Makevars contains", cflags, "ok\n") -if (!grepl("^CFLAGS=-O[0-3]$", cflags)) { +cat("~/.R/Makevars contains", cflags) +if (!grepl("^CFLAGS=-O[0-3] *$", cflags)) { stop("Some packages have failed to install in the past (e.g. processx and RGtk2) when CFLAGS contains -pedandic, -Wall, and similar. ", - "So for revdepr keep CFLAGS simple; i.e. -O[0-3] only.") + "So for revdepr keep CFLAGS simple; i.e. -O[0-3] only. Check ~/.R/Makevars.") +} else { + cat(" ok\n") } options(repos = c("CRAN"=c("http://cloud.r-project.org"))) @@ -155,7 +157,7 @@ status0 = function(bioc=FALSE) { if (file.exists(fn)) { v = suppressWarnings(system(paste0("grep 'Status:' ",fn), intern=TRUE)) if (!length(v)) return("RUNNING") - return(substring(v,9)) + return(substr(v, 9L, nchar(v))) } if (file.exists(paste0("./",x,".Rcheck"))) return("RUNNING") return("NOT STARTED") @@ -248,7 +250,6 @@ cran = function() # reports CRAN status of the .cran.fail packages cat("tools::CRAN_check_results() returned",prettyNum(nrow(db), big.mark=","),"rows in",timetaken(p),"\n") rel = unique(db$Flavor) rel = sort(rel[grep("release",rel)]) - stopifnot(identical(rel, c("r-release-linux-x86_64", "r-release-macos-x86_64", "r-release-windows-ix86+x86_64"))) cat("R-release is used for revdep checking so comparing to CRAN results for R-release\n") ans = db[Package %chin% .fail.cran & Flavor %chin% rel, Status, keyby=.(Package, Flavor)] dcast(ans, Package~Flavor, value.var="Status", fill="")[.fail.cran,] diff --git a/.gitattributes b/.gitattributes index fa1385d99a..9c72b27aea 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,3 @@ * -text +*.Rraw linguist-language=R + diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2f760c2782..d36f99fbcd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,9 +6,9 @@ variables: TZ: "UTC" ## to avoid 'Failed to create bus connection' from 
timedatectl via Sys.timezone() on Docker with R 3.4. ## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under ## a non-UTC timezone, although, that's what we do routinely in dev. - R_REL_VERSION: "4.0" - R_DEVEL_VERSION: "4.1" - R_OLDREL_VERSION: "3.6" + R_REL_VERSION: "4.1" + R_DEVEL_VERSION: "4.2" + R_OLDREL_VERSION: "4.0" stages: - dependencies @@ -61,7 +61,7 @@ build: ## build data.table sources as tar.gz archive image: registry.gitlab.com/jangorecki/dockerfiles/r-builder needs: ["mirror-packages"] before_script: - - Rscript -e 'install.packages("knitr", repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' + - Rscript -e 'install.packages(c("knitr","rmarkdown"), repos=file.path("file:",normalizePath("bus/mirror-packages/cran")), quiet=TRUE)' - rm -r bus - echo "Revision:" $CI_BUILD_REF >> ./DESCRIPTION script: @@ -96,16 +96,14 @@ build: ## build data.table sources as tar.gz archive - mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION .test-install-r-rel-win: &install-r-rel-win - - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.3/R-4.0.3-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.0/R-4.1.0-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-devel-win: &install-r-devel-win - curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-r-oldrel-win: &install-r-oldrel-win - - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/3.6.3/R-3.6.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe 
-ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait + - curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.5/R-4.0.5-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait .test-install-rtools-win: &install-rtools-win - curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait -.test-install-rtools35-win: &install-rtools35-win - - curl.exe -s -o ../Rtools35.exe https://cloud.r-project.org/bin/windows/Rtools/Rtools35.exe; Start-Process -FilePath ..\Rtools35.exe -ArgumentList "/VERYSILENT /DIR=C:\Rtools" -NoNewWindow -Wait .test-template: &test stage: test @@ -191,7 +189,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual variables: _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 - _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "2 NOTEs" + _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0 before_script: - *install-deps - *cp-src @@ -205,7 +203,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - *rm-src - >- - Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 2 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 2 NOTEs"), " (size of tarball) but ", shQuote(l)) else q("no")' + Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 1 NOTE")) stop("Last line of 
", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), " (installed package size) but ", shQuote(l)) else q("no")' test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure <<: *test-lin @@ -285,8 +283,8 @@ test-old-win: ## R-oldrel on Windows R_VERSION: "$R_OLDREL_VERSION" before_script: - *install-r-oldrel-win - - *install-rtools35-win - - $ENV:PATH = "C:\R\bin;C:\Rtools\bin;$ENV:PATH" + - *install-rtools-win + - $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH" - *install-deps-win - *cp-src-win - rm.exe -r bus diff --git a/DESCRIPTION b/DESCRIPTION index 78ca52b485..8ab2deaa0d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -61,10 +61,13 @@ Authors@R: c( person("Vaclav","Tlapak", role="ctb"), person("Kevin","Ushey", role="ctb"), person("Dirk","Eddelbuettel", role="ctb"), - person("Ben","Schwen", role="ctb")) + person("Ben","Schwen", role="ctb"), + person("Tony","Fischetti", role="ctb"), + person("Ofek","Shilon", role="ctb"), + person("Vadim","Khotilovich", role="ctb")) Depends: R (>= 3.1.0) Imports: methods -Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown +Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
License: MPL-2.0 | file LICENSE diff --git a/NAMESPACE b/NAMESPACE index fdfd765396..0aa68631e5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(substitute2) S3method("[", data.table) S3method("[<-", data.table) diff --git a/NEWS.md b/NEWS.md index f5e0ee0527..b150f4236c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,8 +11,131 @@ 2. `mean(na.rm=TRUE)` by group is now GForce optimized, [#4849](https://github.com/Rdatatable/data.table/issues/4849). Thanks to the [h2oai/db-benchmark](https://github.com/h2oai/db-benchmark) project for spotting this issue. The 1 billion row example in the issue shows 48s reduced to 14s. The optimization also applies to type `integer64` resulting in a difference to the `bit64::mean.integer64` method: `data.table` returns a `double` result whereas `bit64` rounds the mean to the nearest integer. +3. `fwrite()` now writes UTF-8 or native csv files by specifying the `encoding=` argument, [#1770](https://github.com/Rdatatable/data.table/pull/1770). Thanks to @shrektan for the request and the PR. + +4. `data.table()` no longer fills empty vectors with `NA` with warning. Instead a 0-row `data.table` is returned, [#3727](https://github.com/Rdatatable/data.table/issues/3727). Since `data.table()` is used internally by `.()`, this brings the following examples in line with expectations in most cases. Thanks to @shrektan for the suggestion and PR. 
+ + ```R + DT = data.table(A=1:3, B=letters[1:3]) + DT[A>3, .(ITEM='A>3', A, B)] # (1) + DT[A>3][, .(ITEM='A>3', A, B)] # (2) + # the above are now equivalent as expected and return: + Empty data.table (0 rows and 3 cols): ITEM,A,B + # Previously, (2) returned : + ITEM A B + + 1: A>3 NA + Warning messages: + 1: In as.data.table.list(jval, .named = NULL) : + Item 2 has 0 rows but longest item has 1; filled with NA + 2: In as.data.table.list(jval, .named = NULL) : + Item 3 has 0 rows but longest item has 1; filled with NA + ``` + + ```R + DT = data.table(A=1:3, B=letters[1:3], key="A") + DT[.(1:3, double()), B] + # new result : + character(0) + # old result : + [1] "a" "b" "c" + Warning message: + In as.data.table.list(i) : + Item 2 has 0 rows but longest item has 3; filled with NA + ``` + +5. `%like%` on factors with a large number of levels is now faster, [#4748](https://github.com/Rdatatable/data.table/issues/4748). The example in the PR shows 2.37s reduced to 0.86s on a factor length 100 million containing 1 million unique 10-character strings. Thanks to @statquant for reporting, and @shrektan for implementing. + +6. `keyby=` now accepts `TRUE`/`FALSE` together with `by=`, [#4307](https://github.com/Rdatatable/data.table/issues/4307). The primary motivation is benchmarking where `by=` vs `keyby=` is varied across a set of queries. Thanks to Jan Gorecki for the request and the PR. + + ```R + DT[, sum(colB), keyby="colA"] + DT[, sum(colB), by="colA", keyby=TRUE] # same + ``` + +7. `fwrite()` gains a new `datatable.fwrite.sep` option to change the default separator, still `","` by default. Thanks to Tony Fischetti for the PR. As is good practice in R in general, we usually resist new global options for the reason that a user changing the option for their own code can inadvertently change the behaviour of any package using `data.table` too. However, in this case, the global option affects file output rather than code behaviour. 
In fact, the very reason the user may wish to change the default separator is that they know a different separator is more appropriate for their data being passed to the package using `fwrite` but cannot otherwise change the `fwrite` call within that package. + +8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing. + +9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551), [#4998](https://github.com/Rdatatable/data.table/issues/4998). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New functions `measure()` and `measurev()` which use either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage, Mark Fairbanks for reporting, and to @tdhock for implementing. + +10. A new interface for _programming on data.table_ has been added, closing [#2655](https://github.com/Rdatatable/data.table/issues/2655) and many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. 
For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing. + + ```R + DT = data.table(x = 1:5, y = 5:1) + + # parameters + in_col_name = "x" + fun = "sum" + fun_arg1 = "na.rm" + fun_arg1val = TRUE + out_col_name = "sum_x" + + # parameterized query + #DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))] + + # desired query + DT[, .(sum_x = sum(x, na.rm=TRUE))] + + # new interface + DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + )] + ``` + +11. `DT[, if (...) .(a=1L) else .(a=1L, b=2L), by=group]` now returns a 1-column result with warning `j may not evaluate to the same number of columns for each group`, rather than error `'names' attribute [2] must be the same length as the vector`, [#4274](https://github.com/Rdatatable/data.table/issues/4274). Thanks to @robitalec for reporting, and Michael Chirico for the PR. + +12. Typo checking in `i` available since 1.11.4 is extended to work in non-English sessions, [#4989](https://github.com/Rdatatable/data.table/issues/4989). Thanks to Michael Chirico for the PR. + +13. `fifelse()` now coerces logical `NA` to other types and the `na` argument supports vectorized input, [#4277](https://github.com/Rdatatable/data.table/issues/4277) [#4286](https://github.com/Rdatatable/data.table/issues/4286) [#4287](https://github.com/Rdatatable/data.table/issues/4287). Thanks to @michaelchirico and @shrektan for reporting, and @shrektan for implementing. + ## BUG FIXES +1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). 
Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. + +2. `print(DT, trunc.cols=TRUE)` and the corresponding `datatable.print.trunc.cols` option (new feature 3 in v1.13.0) could incorrectly display an extra column, [#4266](https://github.com/Rdatatable/data.table/issues/4266). Thanks to @tdhock for the bug report and @MichaelChirico for the PR. + +3. `fread(..., nrows=0L)` now works as intended and the same as `nrows=0`; i.e. returning the column names and typed empty columns determined by the large sample, [#4686](https://github.com/Rdatatable/data.table/issues/4686). Thanks to @hongyuanjia for reporting, and Benjamin Schwendinger for the PR. + +4. Passing `.SD` to `frankv()` with `ties.method='random'` or with `na.last=NA` failed with `.SD is locked`, [#4429](https://github.com/Rdatatable/data.table/issues/4429). Thanks @smarches for the report. + +5. Filtering data.table using `which=NA` to return non-matching indices will now properly work for non-optimized subsetting as well, closes [#4411](https://github.com/Rdatatable/data.table/issues/4411). + +6. When `j` returns an object whose class `"X"` inherits from `data.table`; i.e. class `c("X", "data.table", "data.frame")`, the derived class `"X"` is no longer incorrectly dropped from the class of the `data.table` returned, [#4324](https://github.com/Rdatatable/data.table/issues/4324). Thanks to @HJAllen for reporting and @shrektan for the PR. + +7. `as.data.table()` failed with `.subset2(x, i, exact = exact): attempt to select less than one element in get1index` when passed an object inheriting from `data.table` with a different `[[` method, such as the class `dfidx` from the `dfidx` package, [#4526](https://github.com/Rdatatable/data.table/issues/4526). 
Thanks @RicoDiel for the report, and Michael Chirico for the PR. + +8. `rbind()` and `rbindlist()` of length-0 ordered factors failed with `Internal error: savetl_init checks failed`, [#4795](https://github.com/Rdatatable/data.table/issues/4795) [#4823](https://github.com/Rdatatable/data.table/issues/4823). Thanks to @shrektan and @dbart79 for reporting, and @shrektan for fixing. + +9. `data.table(NULL)[, firstCol:=1L]` created `data.table(firstCol=1L)` ok but did not update the internal `row.names` attribute, causing `Error in '$<-.data.frame'(x, name, value) : replacement has 1 row, data has 0` when passed to packages like `ggplot` which use `DT` as if it is a `data.frame`, [#4597](https://github.com/Rdatatable/data.table/issues/4597). Thanks to Matthew Son for reporting, and Cole Miller for the PR. + +10. `X[Y, .SD, by=]` (joining and grouping in the same query) could segfault if i) `by=` is supplied custom data (i.e. not simple expressions of columns), and ii) some rows of `Y` do not match to any rows in `X`, [#4892](https://github.com/Rdatatable/data.table/issues/4892). Thanks to @Kodiologist for reporting, @ColeMiller1 for investigating, and @tlapak for the PR. + +11. Assigning a set of 2 or more all-NA values to a factor column could segfault, [#4824](https://github.com/Rdatatable/data.table/issues/4824). Thanks to @clerousset for reporting and @shrektan for fixing. + +12. `as.data.table(table(NULL))` now returns `data.table(NULL)` rather than error `attempt to set an attribute on NULL`, [#4179](https://github.com/Rdatatable/data.table/issues/4179). The result differs slightly to `as.data.frame(table(NULL))` (0-row, 1-column) because 0-column works better with other `data.table` functions like `rbindlist()`. Thanks to Michael Chirico for the report and fix. + +13. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). 
Thanks to @tdhock for reporting and fixing. + +14. `by=...get()...` could fail with `object not found`, [#4873](https://github.com/Rdatatable/data.table/issues/4873) [#4981](https://github.com/Rdatatable/data.table/issues/4981). Thanks to @sindribaldur for reporting, and @OfekShilon for fixing. + +15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. + +16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank instead of blank, [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. + +17. Assigning a wrong-length or non-list vector to a list column could segfault, [#4166](https://github.com/Rdatatable/data.table/issues/4166) [#4667](https://github.com/Rdatatable/data.table/issues/4667) [#4678](https://github.com/Rdatatable/data.table/issues/4678) [#4729](https://github.com/Rdatatable/data.table/issues/4729). Thanks to @fklirono, Kun Ren, @kevinvzandvoort and @peterlittlejohn for reporting, and to Václav Tlapák for the PR. + +18. `as.data.table()` on `xts` objects containing a column named `x` would return an `index` of type plain `integer` rather than `POSIXct`, [#4897](https://github.com/Rdatatable/data.table/issues/4897). Thanks to Emil Sjørup for reporting, and Jan Gorecki for the PR. + +19. 
A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. + +20. `uniqueN(DT, by=character())` is now equivalent to `uniqueN(DT)` rather than internal error `'by' is either not integer or is length 0`, [#4594](https://github.com/Rdatatable/data.table/issues/4594). Thanks Marco Colombo for the report, and Michael Chirico for the PR. Similarly for `unique()`, `duplicated()` and `anyDuplicated()`. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. 
If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : @@ -25,6 +148,19 @@ nafill(x, fill=as.integer(3.14)) # no warning; the as. conveys intent ``` +2. `CsubsetDT` exported C function has been renamed to `DT_subsetDT`. This requires `R_GetCCallable("data.table", "CsubsetDT")` to be updated to `R_GetCCallable("data.table", "DT_subsetDT")`. Additionally there is now a dedicated header file for data.table C exports `include/datatableAPI.h`, [#4643](https://github.com/Rdatatable/data.table/issues/4643), thanks to @eddelbuettel, which makes it easier to _import_ data.table C functions. + +3. In v1.12.4, fractional `fread(..., stringsAsFactors=)` was added. For example if `stringsAsFactors=0.2`, any character column with fewer than 20% unique strings would be cast as `factor`. This is now documented in `?fread` as well, [#4706](https://github.com/Rdatatable/data.table/issues/4706). Thanks to @markderry for the PR. + +4. `cube(DT, by="a")` now gives a more helpful error that `j` is missing, [#4282](https://github.com/Rdatatable/data.table/pull/4282). + +5. v1.13.0 (July 2020) fixed a segfault/corruption/error (depending on version of R and circumstances) in `dcast()` when `fun.aggregate` returned `NA` (type `logical`) in an otherwise `character` result, [#2394](https://github.com/Rdatatable/data.table/issues/2394). This fix was the result of other internal rework and there was no news item at the time. A new test to cover this case has now been added. Thanks Vadim Khotilovich for reporting, and Michael Chirico for investigating, pinpointing when the fix occurred and adding the test. + +6. 
`DT[subset]` where `DT[(subset)]` or `DT[subset==TRUE]` was intended; i.e., subsetting by a logical column whose name conflicts with an existing function, now gives a friendlier error message, [#5014](https://github.com/Rdatatable/data.table/issues/5014). Thanks @michaelchirico for the suggestion and PR, and @ColeMiller1 for helping with the fix. + +7. Grouping by a `list` column has its error message improved stating this is unsupported, [#4308](https://github.com/Rdatatable/data.table/issues/4308). Thanks @sindribaldur for filing, and @michaelchirico for the PR. Please add your vote and especially use cases to the [#1597](https://github.com/Rdatatable/data.table/issues/1597) feature request. + + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) ## POTENTIALLY BREAKING CHANGES @@ -58,7 +194,7 @@ 2. `fwrite()`'s mutithreaded `gzip` compression failed on Solaris with Z_STREAM_ERROR, [#4099](https://github.com/Rdatatable/data.table/issues/4099). Since this feature was released in Oct 2019 (see item 3 in v1.12.4 below in this news file) there have been no known problems with it on Linux, Windows or Mac. For Solaris, we have been successively adding more and more detailed tracing to the output in each release, culminating in tracing `zlib` internals at byte level by reading `zlib`'s source. The problem did not manifest itself on [R-hub](https://builder.r-hub.io/)'s Solaris instances, so we had to work via CRAN output. If `zlib`'s `z_stream` structure is declared inside a parallel region but before a parallel for, it appears that the particular OpenMP implementation used by CRAN's Solaris moves the structure to a new address on entering the parallel for. Ordinarily this memory move would not matter, however, `zlib` internals have a self reference pointer to the parent, and check that the pointers match. This mismatch caused the -2 (Z_STREAM_ERROR). 
Allocating an array of structures, one for each thread, before the parallel region avoids the memory move with no cost. - It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. + It should be carefully noted that we cannot be sure it really is a problem unique to CRAN's Solaris. Even if it seems that way after one year of observations. For example, it could be compiler flags, or particular memory circumstances, either of which could occur on other operating systems too. However, we are unaware of why it would make sense for the OpenMP implementation to move the structure at that point. Any optimizations such as aligning the set of structures to cache line boundaries could be performed at the start of the parallel region, not after the parallel for. If anyone reading this knows more, please let us know. ## NOTES @@ -103,7 +239,7 @@ 1. `bit64` v4.0.2 and `bit` v4.0.3, both released on 30th July, correctly broke `data.table`'s tests. Like other packages on our `Suggest` list, we check `data.table` works with `bit64` in our tests. The first break was because `all.equal` always returned `TRUE` in previous versions of `bit64`. Now that `all.equal` works for `integer64`, the incorrect test comparison was revealed. If you use `bit64`, or `nanotime` which uses `bit64`, it is highly recommended to upgrade to the latest `bit64` version. Thanks to Cole Miller for the PR to accommodate `bit64`'s update. 
The second break caused by `bit` was the addition of a `copy` function. We did not ask, but the `bit` package kindly offered to change to a different name since `data.table::copy` is long standing. `bit` v4.0.4 released 4th August renamed `copy` to `copy_vector`. Otherwise, users of `data.table` would have needed to prefix every occurrence of `copy` with `data.table::copy` if they use `bit64` too, since `bit64` depends on (rather than importing) `bit`. Again, this impacted `data.table`'s tests which mimic a user's environment; not `data.table` itself per se. - + We have requested that CRAN policy be modified to require that reverse dependency testing include packages which `Suggest` the package. Had this been the case, reverse dependency testing of `bit64` would have caught the impact on `data.table` before release. 2. `?.NGRP` now displays the help page as intended, [#4946](https://github.com/Rdatatable/data.table/issues/4649). Thanks to @KyleHaynes for posting the issue, and Cole Miller for the fix. `.NGRP` is a symbol new in v1.13.0; see below in this file. @@ -122,7 +258,7 @@ has a better chance of working on Mac. 1. `fread` now supports native parsing of `%Y-%m-%d`, and [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `%Y-%m-%dT%H:%M:%OS%z`, [#4464](https://github.com/Rdatatable/data.table/pull/4464). Dates are returned as `data.table`'s `integer`-backed `IDate` class (see `?IDate`), and datetimes are returned as `POSIXct` provided either `Z` or the offset from `UTC` is present; e.g. `fwrite()` outputs UTC by default including the final `Z`. Reminder that `IDate` inherits from R's `Date` and is identical other than it uses the `integer` type where (oddly) R uses the `double` type for dates (8 bytes instead of 4). `fread()` gains a `tz` argument to control datetime values that are missing a Z or UTC-offset (now referred to as *unmarked* datetimes); e.g. as written by `write.csv`. By default `tz=""` means, as in R, read the unmarked datetime in local time. 
Unless the timezone of the R session is UTC (e.g. the TZ environment variable is set to `"UTC"`, or `""` on non-Windows), unmarked datetime will then by read by `fread` as character, as before. If you have been using `colClasses="POSIXct"` that will still work using R's `as.POSIXct()` which will interpret the unmarked datetime in local time, as before, and still slowly. You can tell `fread` to read unmarked datetime as UTC, and quickly, by passing `tz="UTC"` which may be appropriate in many circumstances. Note that the default behaviour of R to read and write csv using unmarked datetime can lead to different research results when the csv file has been saved in one timezone and read in another due to observations being shifted to a different date. If you have been using `colClasses="POSIXct"` for UTC-marked datetime (e.g. as written by `fwrite` including the final `Z`) then it will automatically speed up with no changes needed. Since this is a potentially breaking change, i.e. existing code may depend on dates and datetimes being read as type character as before, a temporary option is provided to restore the old behaviour: `options(datatable.old.fread.datetime.character=TRUE)`. However, in most cases, we expect existing code to still work with no changes. - + The minor version number is bumped from 12 to 13, i.e. `v1.13.0`, where the `.0` conveys 'be-aware' as is common practice. As with any new feature, there may be bugs to fix and changes to defaults required in future. In addition to convenience, `fread` is now significantly faster in the presence of dates, UTC-marked datetimes, and unmarked datetime when tz="UTC" is provided. ## NEW FEATURES @@ -229,7 +365,6 @@ has a better chance of working on Mac. 11. `copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR. 12. 
`rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. - 13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388). 14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report. @@ -244,7 +379,7 @@ has a better chance of working on Mac. 19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR. -20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8). +20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. 
For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8).r ## NOTES @@ -1484,4 +1619,3 @@ When `j` is a symbol (as in the quanteda and xgboost examples above) it will con # data.table v1.9.8 (Nov 2016) back to v1.2 (Aug 2008) has been moved to [NEWS.0.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.0.md) - diff --git a/R/IDateTime.R b/R/IDateTime.R index 0c0be82e83..832424091f 100644 --- a/R/IDateTime.R +++ b/R/IDateTime.R @@ -7,6 +7,10 @@ as.IDate = function(x, ...) UseMethod("as.IDate") as.IDate.default = function(x, ..., tz = attr(x, "tzone", exact=TRUE)) { if (is.null(tz)) tz = "UTC" + if (is.character(x)) { + # backport of similar patch to base::as.Date.character in R 4.0.3, #4676 + is.na(x) = !nzchar(x) + } as.IDate(as.Date(x, tz = tz, ...)) } @@ -240,20 +244,20 @@ rep.ITime = function (x, ...) class(y) = "ITime" # unlass and rep could feasibly not copy, hence use class<- not setattr() y } - -round.ITime <- function(x, digits = c("hours", "minutes"), ...) + +round.ITime <- function(x, digits = c("hours", "minutes"), ...) { (setattr(switch(match.arg(digits), hours = as.integer(round(unclass(x)/3600)*3600), - minutes = as.integer(round(unclass(x)/60)*60)), + minutes = as.integer(round(unclass(x)/60)*60)), "class", "ITime")) -} +} -trunc.ITime <- function(x, units = c("hours", "minutes"), ...) +trunc.ITime <- function(x, units = c("hours", "minutes"), ...) 
{ (setattr(switch(match.arg(units), hours = as.integer(unclass(x)%/%3600*3600), - minutes = as.integer(unclass(x)%/%60*60)), + minutes = as.integer(unclass(x)%/%60*60)), "class", "ITime")) } diff --git a/R/as.data.table.R b/R/as.data.table.R index 308a7b2ffe..75e8d23ae0 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -20,7 +20,7 @@ as.data.table.Date = as.data.table.ITime = function(x, keep.rownames=FALSE, key= tt = deparse(substitute(x))[1L] nm = names(x) # FR #2356 - transfer names of named vector as "rn" column if required - if (!identical(keep.rownames, FALSE) & !is.null(nm)) + if (!identical(keep.rownames, FALSE) && !is.null(nm)) x = list(nm, unname(x)) else x = list(x) if (tt == make.names(tt)) { @@ -33,6 +33,8 @@ as.data.table.Date = as.data.table.ITime = function(x, keep.rownames=FALSE, key= # as.data.table.table - FR #361 as.data.table.table = function(x, keep.rownames=FALSE, key=NULL, ...) { + # prevent #4179 & just cut out here + if (any(dim(x) == 0L)) return(null.data.table()) # Fix for bug #43 - order of columns are different when doing as.data.table(with(DT, table(x, y))) val = rev(dimnames(provideDimnames(x))) if (is.null(names(val)) || !any(nzchar(names(val)))) @@ -95,12 +97,12 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va # NULL dimnames will create integer keys, not character as in table method val = if (is.null(dnx)) { lapply(dx, seq.int) - } else if (any(nulldnx<-sapply(dnx, is.null))) { + } else if (any(nulldnx <- vapply_1b(dnx, is.null))) { dnx[nulldnx] = lapply(dx[nulldnx], seq.int) #3636 dnx } else dnx val = rev(val) - if (is.null(names(val)) || all(!nzchar(names(val)))) + if (is.null(names(val)) || !any(nzchar(names(val)))) setattr(val, 'names', paste0("V", rev(seq_along(val)))) if (value.name %chin% names(val)) stop("Argument 'value.name' should not overlap with column names in result: ", brackify(rev(names(val)))) @@ -129,6 +131,7 @@ as.data.table.list = function(x, eachncol = integer(n) 
missing.check.names = missing(check.names) origListNames = if (missing(.named)) names(x) else NULL # as.data.table called directly, not from inside data.table() which provides .named, #3854 + empty_atomic = FALSE for (i in seq_len(n)) { xi = x[[i]] if (is.null(xi)) next # eachncol already initialized to 0 by integer() above @@ -148,10 +151,13 @@ as.data.table.list = function(x, } eachnrow[i] = NROW(xi) # for a vector (including list() columns) returns the length eachncol[i] = NCOL(xi) # for a vector returns 1 + if (is.atomic(xi) && length(xi)==0L && !is.null(xi)) { + empty_atomic = TRUE # any empty atomic (not empty list()) should result in nrows=0L, #3727 + } } ncol = sum(eachncol) # hence removes NULL items silently (no error or warning), #842. if (ncol==0L) return(null.data.table()) - nrow = max(eachnrow) + nrow = if (empty_atomic) 0L else max(eachnrow) ans = vector("list",ncol) # always return a new VECSXP recycle = function(x, nrow) { if (length(x)==nrow) { @@ -173,8 +179,6 @@ as.data.table.list = function(x, if (is.null(xi)) { n_null = n_null+1L; next } if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") - if (eachnrow[i]==0L && nrow>0L && is.atomic(xi)) # is.atomic to ignore list() since list() is a common way to initialize; let's not insist on list(NULL) - warning("Item ", i, " has 0 rows but longest item has ", nrow, "; filled with NA") # the rep() in recycle() above creates the NA vector if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above prefix = if (!isFALSE(.named[i]) && isTRUE(nchar(names(x)[i])>0L)) paste0(names(x)[i],".") else "" # test 2058.12 for (j in seq_along(xi)) { @@ -219,7 +223,8 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) 
{ } if (any(vapply_1i(x, function(xi) length(dim(xi))))) { # not is.atomic because is.atomic(matrix) is true # a data.frame with a column that is data.frame needs to be expanded; test 2013.4 - return(as.data.table.list(x, keep.rownames=keep.rownames, ...)) + # x may be a class with [[ method that behaves differently, so as.list first for default [[, #4526 + return(as.data.table.list(as.list(x), keep.rownames=keep.rownames, ...)) } ans = copy(x) # TO DO: change this deep copy to be shallow. setattr(ans, "row.names", .set_row_names(nrow(x))) diff --git a/R/between.R b/R/between.R index f5a6600da6..61fee332b4 100644 --- a/R/between.R +++ b/R/between.R @@ -44,11 +44,11 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) # length(upper) can be 1 or length(x) independently of lower .Call(Cbetween, x, lower, upper, incbounds, NAbounds, check) } else { - if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") + if (isTRUE(getOption("datatable.verbose"))) catf("optimised between not available for this data type, fallback to slow R routine\n") if (isTRUE(NAbounds) && (anyNA(lower) || anyNA(upper))) stop("Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") if (check && any(lower>upper, na.rm=TRUE)) stop("Some lower>upper for this non-numeric and non-character type") - if (incbounds) x>=lower & x<=upper - else x>lower & x=lower & x<=upper # this & is correct not && + else x> lower & x< upper } } @@ -78,7 +78,7 @@ inrange = function(x,lower,upper,incbounds=TRUE) { subject = setDT(list(l=lower, u=upper)) ops = if (incbounds) c(4L, 2L) else c(5L, 3L) # >=,<= and >,< verbose = isTRUE(getOption("datatable.verbose")) - if (verbose) {last.started.at=proc.time();cat("forderv(query) took ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("forderv(query) took ... 
");flush.console()} if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} ans = bmerge(shallow(subject), query, 1L:2L, c(1L,1L), 0, c(FALSE, TRUE), 0L, "all", ops, verbose) # fix for #1819, turn on verbose messages @@ -86,9 +86,9 @@ inrange = function(x,lower,upper,incbounds=TRUE) { options(datatable.verbose=FALSE) setDT(ans[c("starts", "lens")], key=c("starts", "lens")) options(datatable.verbose=verbose) - if (verbose) {last.started.at=proc.time();cat("Generating final logical vector ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Generating final logical vector ... ");flush.console()} .Call(Cinrange, idx <- vector("logical", length(x)), xo, ans[["starts"]], ans[["lens"]]) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console} idx } diff --git a/R/bmerge.R b/R/bmerge.R index 3d6ab028f3..6bafd0e5bc 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -43,23 +43,25 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos xc = xcols[a] xclass = getClass(x[[xc]]) iclass = getClass(i[[ic]]) + xname = paste0("x.", names(x)[xc]) + iname = paste0("i.", names(i)[ic]) if (!xclass %chin% supported) stop("x.", names(x)[xc]," is type ", xclass, " which is not supported by data.table join") if (!iclass %chin% supported) stop("i.", names(i)[ic]," is type ", iclass, " which is not supported by data.table join") if (xclass=="factor" || iclass=="factor") { if (roll!=0.0 && a==length(icols)) stop("Attempting roll join on factor column when joining x.",names(x)[xc]," to i.",names(i)[ic],". 
Only integer, double or character columns may be roll joined.") if (xclass=="factor" && iclass=="factor") { - if (verbose) cat("Matching i.",names(i)[ic]," factor levels to x.",names(x)[xc]," factor levels.\n",sep="") + if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values next } else { if (xclass=="character") { - if (verbose) cat("Coercing factor column i.",names(i)[ic]," to type character to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing factor column %s to type character to match type of %s.\n", iname, xname) set(i, j=ic, value=val<-as.character(i[[ic]])) set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 next } else if (iclass=="character") { - if (verbose) cat("Matching character column i.",names(i)[ic]," to factor levels in x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname) newvalue = chmatch(i[[ic]], levels(x[[xc]]), nomatch=0L) if (anyNA(i[[ic]])) newvalue[is.na(i[[ic]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809 set(i, j=ic, value=newvalue) @@ -69,29 +71,29 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos stop("Incompatible join types: x.", names(x)[xc], " (",xclass,") and i.", names(i)[ic], " (",iclass,"). Factor columns must join to factor or character columns.") } if (xclass == iclass) { - if (verbose) cat("i.",names(i)[ic]," has same type (",xclass,") as x.",names(x)[xc],". No coercion needed.\n", sep="") + if (verbose) catf("%s has same type (%s) as %s. 
No coercion needed.\n", iname, xclass, xname) next } if (xclass=="character" || iclass=="character" || xclass=="logical" || iclass=="logical" || xclass=="factor" || iclass=="factor") { if (anyNA(i[[ic]]) && allNA(i[[ic]])) { - if (verbose) cat("Coercing all-NA i.",names(i)[ic]," (",iclass,") to type ",xclass," to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, iclass, xclass, xname) set(i, j=ic, value=match.fun(paste0("as.", xclass))(i[[ic]])) next } else if (anyNA(x[[xc]]) && allNA(x[[xc]])) { - if (verbose) cat("Coercing all-NA x.",names(x)[xc]," (",xclass,") to type ",iclass," to match type of i.",names(i)[ic],".\n",sep="") + if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, xclass, iclass, iname) set(x, j=xc, value=match.fun(paste0("as.", iclass))(x[[xc]])) next } stop("Incompatible join types: x.", names(x)[xc], " (",xclass,") and i.", names(i)[ic], " (",iclass,")") } if (xclass=="integer64" || iclass=="integer64") { - nm = paste0(c("i.","x."), c(names(i)[ic], names(x)[xc])) + nm = c(iname, xname) if (xclass=="integer64") { w=i; wc=ic; wclass=iclass; } else { w=x; wc=xc; wclass=xclass; nm=rev(nm) } # w is which to coerce if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) { - if (verbose) cat("Coercing ",wclass," column ", nm[1L], if(wclass=="double")" (which contains no fractions)"," to type integer64 to match type of ", nm[2L],".\n",sep="") + if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L]) set(w, j=wc, value=bit64::as.integer64(w[[wc]])) } else stop("Incompatible join types: ", nm[2L], " is type integer64 but ", nm[1L], " is type double and contains fractions") } else { @@ -100,17 +102,17 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if 
(!isReallyReal(i[[ic]])) { # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys # we've always coerced to int and returned int, for convenience. - if (verbose) cat("Coercing double column i.",names(i)[ic]," (which contains no fractions) to type integer to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s", iname, xname) val = as.integer(i[[ic]]) if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679 set(i, j=ic, value=val) set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too. } else { - if (verbose) cat("Coercing integer column x.",names(x)[xc]," to type double to match type of i.",names(i)[ic]," which contains fractions.\n",sep="") + if (verbose) catf("Coercing integer column %s to type double to match type of %s which contains fractions.\n", xname, iname) set(x, j=xc, value=as.double(x[[xc]])) } } else { - if (verbose) cat("Coercing integer column i.",names(i)[ic]," to type double for join to match type of x.",names(x)[xc],".\n",sep="") + if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", iname, xname) set(i, j=ic, value=as.double(i[[ic]])) } } @@ -126,17 +128,17 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos # equi join. 
use existing key (#1825) or existing secondary index (#1439) if (identical(xcols, head(chmatch(key(x), names(x)), length(xcols)))) { xo = integer(0L) - if (verbose) cat("on= matches existing key, using key\n") + if (verbose) catf("on= matches existing key, using key\n") } else { xo = NULL if (isTRUE(getOption("datatable.use.index"))) { xo = getindex(x, names(x)[xcols]) - if (verbose && !is.null(xo)) cat("on= matches existing index, using index\n") + if (verbose && !is.null(xo)) catf("on= matches existing index, using index\n") } if (is.null(xo)) { if (verbose) {last.started.at=proc.time(); flush.console()} xo = forderv(x, by = xcols) - if (verbose) {cat("Calculated ad hoc index in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("Calculated ad hoc index in %s\n", timetaken(last.started.at)); flush.console()} # TODO: use setindex() instead, so it's cached for future reuse } } @@ -147,9 +149,9 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos # non-equi operators present.. investigate groups.. nqgrp = integer(0L) nqmaxgrp = 1L - if (verbose) cat("Non-equi join operators detected ... \n") + if (verbose) catf("Non-equi join operators detected ... \n") if (roll != FALSE) stop("roll is not implemented for non-equi joins yet.") - if (verbose) {last.started.at=proc.time();cat(" forder took ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" forder took ... ");flush.console()} # TODO: could check/reuse secondary indices, but we need 'starts' attribute as well! xo = forderv(x, xcols, retGrp=TRUE) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} @@ -158,28 +160,28 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (length(resetcols)) { # TODO: can we get around having to reorder twice here? # or at least reuse previous order? - if (verbose) {last.started.at=proc.time();cat(" Generating group lengths ... 
");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" Generating group lengths ... ");flush.console()} resetlen = attr(forderv(x, resetcols, retGrp=TRUE), 'starts', exact=TRUE) resetlen = .Call(Cuniqlengths, resetlen, nrow(x)) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} } else resetlen = integer(0L) - if (verbose) {last.started.at=proc.time();cat(" Generating non-equi group ids ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" Generating non-equi group ids ... ");flush.console()} nqgrp = .Call(Cnestedid, x, xcols[non_equi:length(xcols)], xo, xg, resetlen, mult) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} if (length(nqgrp)) nqmaxgrp = max(nqgrp) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf. if (nqmaxgrp > 1L) { # got some non-equi join work to do if ("_nqgrp_" %in% names(x)) stop("Column name '_nqgrp_' is reserved for non-equi joins.") - if (verbose) {last.started.at=proc.time();cat(" Recomputing forder with non-equi ids ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf(" Recomputing forder with non-equi ids ... 
");flush.console()} set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp) xo = forderv(nqx, c(ncol(nqx), xcols)) - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} } else nqgrp = integer(0L) - if (verbose) cat(" Found", nqmaxgrp, "non-equi group(s) ...\n") + if (verbose) catf(" Found %d non-equi group(s) ...\n", nqmaxgrp) } - if (verbose) {last.started.at=proc.time();cat("Starting bmerge ...\n");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()} ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) - if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()} # TO DO: xo could be moved inside Cbmerge ans$xo = xo # for further use by [.data.table diff --git a/R/cedta.R b/R/cedta.R index 262db0a105..7ace210079 100644 --- a/R/cedta.R +++ b/R/cedta.R @@ -32,15 +32,15 @@ cedta = function(n=2L) { "data.table" %chin% names(getNamespaceImports(ns)) || # most common and recommended cases first for speed (nsname=="utils" && (exists("debugger.look", parent.frame(n+1L)) || - (length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]][[1L]]=='example')) ) || # 'example' for #2972 + (length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972 (nsname=="base" && all(c("FUN", "X") %chin% ls(parent.frame(n)))) || # lapply - (nsname %chin% cedta.pkgEvalsUserCode && any(sapply(sys.calls(), function(x) is.name(x[[1L]]) && (x[[1L]]=="eval" || x[[1L]]=="evalq")))) || + (nsname %chin% cedta.pkgEvalsUserCode && any(vapply_1b(sys.calls(), function(x) is.name(x[[1L]]) && (x[[1L]]=="eval" || x[[1L]]=="evalq")))) || nsname %chin% cedta.override || isTRUE(ns$.datatable.aware) || # As of Sep 2018: RCAS, caretEnsemble, 
dtplyr, rstanarm, rbokeh, CEMiTool, rqdatatable, RImmPort, BPRMeth, rlist tryCatch("data.table" %chin% get(".Depends",paste("package",nsname,sep=":"),inherits=FALSE),error=function(e)FALSE) # both ns$.Depends and get(.Depends,ns) are not sufficient if (!ans && getOption("datatable.verbose")) { # nocov start - cat("cedta decided '",nsname,"' wasn't data.table aware. Here is call stack with [[1L]] applied:\n",sep="") + catf("cedta decided '%s' wasn't data.table aware. Here is call stack with [[1L]] applied:\n", nsname) print(sapply(sys.calls(), "[[", 1L)) # nocov end # so we can trace the namespace name that may need to be added (very unusually) diff --git a/R/data.table.R b/R/data.table.R index b8b2b4bf04..204eef6272 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -108,24 +108,37 @@ replace_dot_alias = function(e) { } .checkTypos = function(err, ref) { - if (grepl('object.*not found', err$message)) { - used = gsub(".*object '([^']+)'.*", "\\1", err$message) + # a slightly wonky workaround so that this still works in non-English sessions, #4989 + # generate this at run time (as opposed to e.g. onAttach) since session language is + # technically OK to update (though this should be rare), and since it's low-cost + # to do so here because we're about to error anyway. + missing_obj_fmt = gsub( + "'missing_datatable_variable____'", + "'(?[^']+)'", + tryCatch(eval(parse(text="missing_datatable_variable____")), error=identity)$message + # eval(parse()) to avoid "no visible binding for global variable" note from R CMD check + # names starting with _ don't parse, so no leading _ in the name + ) + idx <- regexpr(missing_obj_fmt, err$message, perl=TRUE) + if (idx > 0L) { + start = attr(idx, "capture.start", exact=TRUE)[ , "obj_name"] + used = substr( + err$message, + start, + start + attr(idx, "capture.length", exact=TRUE)[ , "obj_name"] - 1L + ) found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE) if (length(found)) { - stop("Object '", used, "' not found. 
Perhaps you intended ", - paste(head(found, 5L), collapse=", "), - if (length(found)<=5L) "" else paste(" or",length(found)-5L, "more")) + stop("Object '", used, "' not found. Perhaps you intended ", brackify(found)) } else { - stop("Object '", used, "' not found amongst ", - paste(head(ref, 5L), collapse=', '), - if (length(ref)<=5L) "" else paste(" and", length(ref)-5L, "more")) + stop("Object '", used, "' not found amongst ", brackify(ref)) } } else { stop(err$message, call.=FALSE) } } -"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL) +"[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) 
@@ -137,7 +150,7 @@ replace_dot_alias = function(e) { else if (missing(drop)) `[.data.frame`(x,i,j) else `[.data.frame`(x,i,j,drop) # added is.data.table(ans) check to fix bug #81 - if (!missing(i) & is.data.table(ans)) setkey(ans,NULL) # See test 304 + if (!missing(i) && is.data.table(ans)) setkey(ans, NULL) # See test 304 return(ans) } if (!missing(verbose)) { @@ -149,16 +162,25 @@ replace_dot_alias = function(e) { } .global$print="" missingby = missing(by) && missing(keyby) # for tests 359 & 590 where passing by=NULL results in data.table not vector - if (!missing(keyby)) { - if (!missing(by)) stop("Provide either by= or keyby= but not both") - if (missing(j)) { warning("Ignoring keyby= because j= is not supplied"); keyby=NULL; } - by=bysub=substitute(keyby) - keyby=TRUE - # Assign to 'by' so that by is no longer missing and we can proceed as if there were one by + if (missingby || missing(j)) { + if (!missingby) warning("Ignoring by/keyby because 'j' is not supplied") + by = bysub = NULL + keyby = FALSE } else { - if (!missing(by) && missing(j)) { warning("Ignoring by= because j= is not supplied"); by=NULL; } - by=bysub= if (missing(by)) NULL else substitute(by) - keyby=FALSE + if (missing(by)) { + by = bysub = if (is.null(env)) substitute(keyby) + else eval(substitute(substitute2(.keyby, env), list(.keyby = substitute(keyby)))) + keyby = TRUE + } else { + by = bysub = if (is.null(env)) substitute(by) + else eval(substitute(substitute2(.by, env), list(.by = substitute(by)))) + if (missing(keyby)) + keyby = FALSE + else if (!isTRUEorFALSE(keyby)) + stop("When by and keyby are both provided, keyby must be TRUE or FALSE") + } + if (missing(by)) { missingby=TRUE; by=bysub=NULL } # possible when env is used, PR#4304 + else if (verbose) cat("Argument 'by' after substitute: ", paste(deparse(bysub, width.cutoff=500L), collapse=" "), "\n", sep="") } bynull = !missingby && is.null(by) #3530 byjoin = !is.null(by) && is.symbol(bysub) && bysub==".EACHI" @@ -214,11 +236,20 
@@ replace_dot_alias = function(e) { av = NULL jsub = NULL if (!missing(j)) { - jsub = replace_dot_alias(substitute(j)) + if (is.null(env)) jsub = substitute(j) else { + jsub = eval(substitute( + substitute2(.j, env), + list(.j = substitute(j)) + )) + if (missing(jsub)) {j = substitute(); jsub=NULL} else if (verbose) cat("Argument 'j' after substitute: ", paste(deparse(jsub, width.cutoff=500L), collapse=" "), "\n", sep="") + } + } + if (!missing(j)) { + jsub = replace_dot_alias(jsub) root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" if (root == ":" || (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || - ( (!length(av<-all.vars(jsub)) || all(substring(av,1L,2L)=="..")) && + ( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) && root %chin% c("","c","paste","paste0","-","!") && missingby )) { # test 763. TODO: likely that !missingby iff with==TRUE (so, with can be removed) # When no variable names (i.e. symbols) occur in j, scope doesn't matter because there are no symbols to find. @@ -235,8 +266,8 @@ replace_dot_alias = function(e) { with=FALSE if (length(av)) { for (..name in av) { - name = substring(..name, 3L) - if (name=="") stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.") + name = substr(..name, 3L, nchar(..name)) + if (!nzchar(name)) stop("The symbol .. is invalid. The .. prefix must be followed by at least one character.") if (!exists(name, where=parent.frame())) { stop("Variable '",name,"' is not found in calling scope. Looking in calling scope because you used the .. 
prefix.", if (exists(..name, where=parent.frame())) @@ -252,7 +283,7 @@ replace_dot_alias = function(e) { ..syms = av } } else if (is.name(jsub)) { - if (substring(jsub, 1L, 2L) == "..") stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov + if (startsWith(as.character(jsub), "..")) stop("Internal error: DT[, ..var] should be dealt with by the branch above now.") # nocov if (!with && !exists(as.character(jsub), where=parent.frame())) stop("Variable '",jsub,"' is not found in calling scope. Looking in calling scope because you set with=FALSE. Also, please use .. symbol prefix and remove with=FALSE.") } @@ -290,10 +321,18 @@ replace_dot_alias = function(e) { # setdiff removes duplicate entries, which'll create issues with duplicated names. Use %chin% instead. dupdiff = function(x, y) x[!x %chin% y] - + isub = NULL + if (!missing(i)) { + if (is.null(env)) isub = substitute(i) else { + isub = eval(substitute( + substitute2(.i, env), + list(.i = substitute(i)) + )) + if (missing(isub)) {i = substitute(); isub=NULL} else if (verbose) cat("Argument 'i' after substitute: ", paste(deparse(isub, width.cutoff=500L), collapse=" "), "\n", sep="") + } + } if (!missing(i)) { xo = NULL - isub = substitute(i) if (identical(isub, NA)) { # only possibility *isub* can be NA (logical) is the symbol NA itself; i.e. DT[NA] # replace NA in this case with NA_integer_ as that's almost surely what user intended to @@ -366,14 +405,17 @@ replace_dot_alias = function(e) { } else { # isub is a single symbol name such as B in DT[B] i = try(eval(isub, parent.frame(), parent.frame()), silent=TRUE) - if (inherits(i,"try-error")) { + if (inherits(i,"try-error") || is.function(i)) { # must be "not found" since isub is a mere symbol col = try(eval(isub, x), silent=TRUE) # is it a column name? - msg = if (inherits(col,"try-error")) " and it is not a column name either." - else paste0(" but it is a column of type ", typeof(col),". 
If you wish to select rows where that column contains TRUE", - ", or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized.") - stop(as.character(isub), " is not found in calling scope", msg, - " When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.") + msg = if (inherits(col, "try-error")) gettextf( + "'%s' is not found in calling scope and it is not a column name either. ", + as.character(isub) + ) else gettextf( + "'%s' is not found in calling scope, but it is a column of type %s. If you wish to select rows where that column contains TRUE, or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE} is particularly clear and is optimized. ", + as.character(isub), typeof(col) + ) + stop(msg, "When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.") } } if (restore.N) { @@ -418,9 +460,11 @@ replace_dot_alias = function(e) { len_common_names = length(common_names) if (!len_common_names) stop("Attempting to do natural join but no common columns in provided tables") if (verbose) { - which_cols_msg = if (len_common_names == length(x)) " all 'x' columns" - else paste(":", brackify(common_names)) - cat("Joining but 'x' has no key, natural join using", which_cols_msg, "\n", sep = "") + which_cols_msg = if (len_common_names == length(x)) { + catf("Joining but 'x' has no key, natural join using all 'x' columns") + } else { + catf("Joining but 'x' has no key, natural join using: %s", brackify(common_names)) + } } on = common_names } @@ -448,10 +492,10 @@ replace_dot_alias = function(e) { # Implementation for not-join along with by=.EACHI, #604 if (notjoin && (byjoin || mult != "all")) { # mult != "all" needed for #1571 notjoin = FALSE - if (verbose) {last.started.at=proc.time();cat("not-join called with 
'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()} + if (verbose) {last.started.at=proc.time();catf("not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ...");flush.console()} orignames = copy(names(i)) i = setdiff_(x, i, rightcols, leftcols) # part of #547 - if (verbose) {cat("done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} setnames(i, orignames[leftcols]) setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted } @@ -479,7 +523,7 @@ replace_dot_alias = function(e) { if (!byjoin || nqbyjoin) { # Really, `anyDuplicated` in base is AWESOME! # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates - if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} irows = if (allLen1) f__ else vecseq(f__,len__, if (allow.cartesian || notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). @@ -493,7 +537,7 @@ replace_dot_alias = function(e) { if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L] } else { if (length(xo) && missing(on)) - stop("Internal error. Cannot by=.EACHI when joining to a secondary key, yet") # nocov + stop("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov # since f__ refers to xo later in grouping, so xo needs to be passed through to dogroups too. if (length(irows)) stop("Internal error. irows has length in by=.EACHI") # nocov @@ -518,7 +562,7 @@ replace_dot_alias = function(e) { if (length(xo) && length(irows)) { irows = xo[irows] # TO DO: fsort here? if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit. 
- if (verbose) {last.started.at=proc.time();cat("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()} irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]] if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } @@ -530,13 +574,13 @@ replace_dot_alias = function(e) { ## restore original order. This is a very expensive operation. ## benchmarks have shown that starting with 1e6 irows, a tweak can significantly reduce time ## (see #2366) - if (verbose) {last.started.at=proc.time()[3L];cat("Reordering", length(irows), "rows after bmerge done in ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Reordering %d rows after bmerge done in ... ", length(irows));flush.console()} if(length(irows) < 1e6){ irows = fsort(irows, internal=TRUE) ## internally, fsort on integer falls back to forderv } else { irows = as.integer(fsort(as.numeric(irows))) ## nocov; parallelized for numeric, but overhead of type conversion } - if (verbose) {cat(round(proc.time()[3L]-last.started.at,3L),"secs\n");flush.console()} + if (verbose) {cat(timetaken(last.started.at), "\n");flush.console()} } ## make sure, all columns are taken from x and not from i. ## This is done by simply telling data.table to continue as if there was a simple subset @@ -553,6 +597,11 @@ replace_dot_alias = function(e) { # i is not a data.table if (!is.logical(i) && !is.numeric(i)) stop("i has evaluated to type ", typeof(i), ". Expecting logical, integer or double.") if (is.logical(i)) { + if (is.na(which)) { # #4411 i filter not optimized to join: DT[A > 1, which = NA] + ## we need this branch here, not below next to which=TRUE because irows=i=which(i) will filter out NAs: DT[A > 10, which = NA] will be incorrect + if (notjoin) stop("internal error: notjoin and which=NA (non-matches), huh? 
please provide reproducible example to issue tracker") # nocov + return(which(is.na(i) | !i)) + } if (length(i)==1L # to avoid unname copy when length(i)==nrow (normal case we don't want to slow down) && isTRUE(unname(i))) { irows=i=NULL } # unname() for #2152 - length 1 named logical vector. # NULL is efficient signal to avoid creating 1:nrow(x) but still return all rows, fixes #1249 @@ -582,9 +631,9 @@ replace_dot_alias = function(e) { if (notjoin) { if (byjoin || !is.integer(irows) || is.na(nomatch)) stop("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov irows = irows[irows!=0L] - if (verbose) {last.started.at=proc.time()[3L];cat("Inverting irows for notjoin done in ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Inverting irows for notjoin done in ... ");flush.console()} i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x)) - if (verbose) cat(round(proc.time()[3L]-last.started.at, 3L), "sec\n") + if (verbose) cat(timetaken(last.started.at), "\n") leftcols = integer() # proceed as if row subset from now on, length(leftcols) is switched on later rightcols = integer() # Doing this once here, helps speed later when repeatedly subsetting each column. 
R's [irows] would do this for each @@ -660,7 +709,7 @@ replace_dot_alias = function(e) { j = eval(jsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) # else j will be evaluated for the first time on next line } else { names(..syms) = ..syms - j = eval(jsub, lapply(substring(..syms,3L), get, pos=parent.frame()), parent.frame()) + j = eval(jsub, lapply(substr(..syms, 3L, nchar(..syms)), get, pos=parent.frame()), parent.frame()) } if (is.logical(j)) j <- which(j) if (!length(j) && !notj) return( null.data.table() ) @@ -681,7 +730,7 @@ replace_dot_alias = function(e) { if (!length(ansvals)) return(null.data.table()) if (!length(leftcols)) { if (!anyNA(ansvals)) return(.Call(CsubsetDT, x, irows, ansvals)) - else stop("column(s) not found: ", paste(ansvars[is.na(ansvals)],collapse=", ")) + else stop("column(s) not found: ", brackify(ansvars[is.na(ansvals)])) } # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) # in both cases leave to the R-level subsetting of i and x together further below @@ -750,7 +799,12 @@ replace_dot_alias = function(e) { bysub = parse(text=paste0("list(",paste(bysub,collapse=","),")"))[[1L]] bysubl = as.list.default(bysub) } - allbyvars = intersect(all.vars(bysub), names_x) + if (any(c("eval","evalq","eval.parent","local","get","mget","dynGet") %chin% all.names(bysub))) + # when the 'by' expression includes get/mget/eval, all.vars cannot be trusted to infer all used columns, #4981 + allbyvars = NULL + else + allbyvars = intersect(all.vars(bysub), names_x) + orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). 
Similar but better than is.sorted(f__) bysameorder = byindex = FALSE if (!bysub %iscall% ":" && ##Fix #4285 @@ -761,11 +815,11 @@ replace_dot_alias = function(e) { # TODO: could be allowed if length(irows)>1 but then the index would need to be squashed for use by uniqlist, #3062 # find if allbyvars is leading subset of any of the indices; add a trailing "__" to fix #3498 where a longer column name starts with a shorter column name tt = paste0(c(allbyvars,""), collapse="__") - w = which.first(substring(paste0(indices(x),"__"),1L,nchar(tt)) == tt) + w = which.first(startsWith(paste0(indices(x), "__"), tt)) if (!is.na(w)) { byindex = indices(x)[w] if (!length(getindex(x, byindex))) { - if (verbose) cat("by index '", byindex, "' but that index has 0 length. Ignoring.\n", sep="") + if (verbose) catf("by index '%s' but that index has 0 length. Ignoring.\n", byindex) byindex=FALSE } } @@ -788,10 +842,10 @@ replace_dot_alias = function(e) { # TO DO: Make xss directly, rather than recursive call. if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset if (length(allbyvars)) { ############### TO DO TO DO TO DO ############### - if (verbose) cat("i clause present and columns used in by detected, only these subset:",paste(allbyvars,collapse=","),"\n") + if (verbose) catf("i clause present and columns used in by detected, only these subset: %s\n", brackify(allbyvars)) xss = x[irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] } else { - if (verbose) cat("i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '",deparse(by),"'\n",sep="") + if (verbose) catf("i clause present but columns used in by not detected. 
Having to subset all columns before evaluating 'by': '%s'\n", deparse(by)) xss = x[irows,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends] } if (bysub %iscall% ':' && length(bysub)==3L) { @@ -827,10 +881,12 @@ replace_dot_alias = function(e) { if (!is.list(byval)) stop("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)") if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)() if (!bynull) for (jj in seq_len(length(byval))) { - if (!typeof(byval[[jj]]) %chin% ORDERING_TYPES) stop("column or expression ",jj," of 'by' or 'keyby' is type ",typeof(byval[[jj]]),". Do not quote column names. Usage: DT[,sum(colC),by=list(colA,month(colB))]") + if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) { + stop(gettextf("Column or expression %d of 'by' or 'keyby' is type '%s' which is not currently supported. If you have a compelling use case, please add it to https://github.com/Rdatatable/data.table/issues/1597. As a workaround, consider converting the column to a supported type, e.g. by=sapply(list_col, toString), whilst taking care to maintain distinctness in the process.", jj, this_type)) + } } tt = vapply_1i(byval,length) - if (any(tt!=xnrow)) stop(gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow, domain='R-data.table')) + if (any(tt!=xnrow)) stop(domain=NA, gettextf("The items in the 'by' or 'keyby' list are length(s) (%s). 
Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", paste(tt, collapse=","), xnrow)) if (is.null(bynames)) bynames = rep.int("",length(byval)) if (length(idx <- which(!nzchar(bynames))) && !bynull) { # TODO: improve this and unify auto-naming of jsub and bysub @@ -849,14 +905,13 @@ replace_dot_alias = function(e) { if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { bynames[jj] = deparse(bysubl[[jj+1L]]) if (verbose) - cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, - "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="") + catf("by-expression '%s' is not named, and the auto-generated name '%s' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", bynames[jj], tt) } else bynames[jj] = tt # if user doesn't like this inferred name, user has to use by=list() to name the column } # Fix for #1334 - if (any(duplicated(bynames))) { + if (anyDuplicated(bynames)) { bynames = make.unique(bynames) } } @@ -866,8 +921,8 @@ replace_dot_alias = function(e) { jvnames = NULL drop_dot = function(x) { if (length(x)!=1L) stop("Internal error: drop_dot passed ",length(x)," items") # nocov - if (identical(substring(x<-as.character(x), 1L, 1L), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY")) - substring(x, 2L) + if (startsWith(x<-as.character(x), ".") && x %chin% c(".N", ".I", ".GRP", ".NGRP", ".BY")) + substr(x, 2L, nchar(x)) else x } @@ -884,12 +939,16 @@ replace_dot_alias = function(e) { # attempt to auto-name unnamed columns for (jj in which(nm=="")) { thisq = q[[jj + 1L]] - if (missing(thisq)) stop(gettextf("Item %d of the .() or list() passed to j is missing", jj, domain="R-data.table")) #3507 + if (missing(thisq)) stop(domain=NA, gettextf("Item %d of the .() or list() passed to j is missing", jj)) #3507 if (is.name(thisq)) nm[jj] = drop_dot(thisq) # TO DO: if call to a[1] for example, then call 
it 'a' too } - if (!is.null(jvnames) && any(idx <- nm != jvnames)) - warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE) + if (!is.null(jvnames)) { + if (length(nm) != length(jvnames)) + warning("j may not evaluate to the same number of columns for each group; if you're sure this warning is in error, please put the branching logic outside of [ for efficiency") + else if (any(idx <- nm != jvnames)) + warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names. If this was intentional (e.g., you know only one branch will ever be used in a given query because the branch is controlled by a function argument), please (1) pull this branch out of the call; (2) explicitly provide missing defaults for each branch in all cases; or (3) use the same name for each branch and re-name it in a follow-up call.', call. = FALSE) + } jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group; reinstated at the end on the result. } @@ -942,7 +1001,7 @@ replace_dot_alias = function(e) { } else { if (colsub %iscall% 'patterns') { # each pattern gives a new filter condition, intersect the end result - .SDcols = Reduce(intersect, do_patterns(colsub, names_x)) + .SDcols = Reduce(intersect, eval_with_cols(colsub, names_x)) } else { .SDcols = eval(colsub, parent.frame(), parent.frame()) # allow filtering via function in .SDcols, #3950 @@ -983,7 +1042,7 @@ replace_dot_alias = function(e) { # added 'mget' - fix for #994 if (any(c("get", "mget") %chin% av)){ if (verbose) - cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. 
Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) + cat(gettextf("'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld ansvars: %s \n", brackify(ansvars))) # get('varname') is too difficult to detect which columns are used in general # eval(macro) column names are detected via the if jsub[[1]]==eval switch earlier above. @@ -1003,7 +1062,7 @@ replace_dot_alias = function(e) { } non_sdvars = setdiff(ansvars, sdvars) ansvals = chmatch(ansvars, names_x) - if (verbose) cat(gettextf("New ansvars: %s \n", brackify(ansvars), domain = "R-data.table")) + if (verbose) catf("New ansvars: %s \n", brackify(ansvars)) } else if (length(non_sdvars)) { # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., @@ -1015,7 +1074,7 @@ replace_dot_alias = function(e) { if (!missing(.SDcols)) warning("This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. 
See ?data.table.") allcols = c(names_x, xdotprefix, names_i, idotprefix) ansvars = sdvars = setdiff(intersect(av, allcols), bynames) - if (verbose) cat("Detected that j uses these columns:",if (!length(ansvars)) "" else paste(ansvars,collapse=","),"\n") + if (verbose) catf("Detected that j uses these columns: %s\n",if (!length(ansvars)) "" else brackify(ansvars)) # using a few named columns will be faster # Consider: DT[,max(diff(date)),by=list(month=month(date))] # and: DT[,lapply(.SD,sum),by=month(date)] @@ -1062,7 +1121,7 @@ replace_dot_alias = function(e) { lhs = names_x[m] } else stop("LHS of := isn't column names ('character') or positions ('integer' or 'numeric')") - if (all(!is.na(m))) { + if (!anyNA(m)) { # updates by reference to existing columns cols = as.integer(m) newnames=NULL @@ -1077,8 +1136,7 @@ replace_dot_alias = function(e) { # fix errors in their RHS when called on empty edge cases, even when the result won't be # used anyway (so it would be annoying to have to fix it.) if (verbose) { - cat("No rows match i. No new columns to add so not evaluating RHS of :=\n") - cat("Assigning to 0 row subset of",nrow(x),"rows\n") + catf("No rows match i. No new columns to add so not evaluating RHS of :=\nAssigning to 0 row subset of %d rows\n", nrow(x)) } .Call(Cassign, x, irows, NULL, NULL, NULL) # only purpose is to write 0 to .Last.updated .global$print = address(x) @@ -1100,9 +1158,9 @@ replace_dot_alias = function(e) { # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). name = substitute(x) if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) - cat("Growing vector of column pointers from truelength ", truelength(x), " to ", n, ". A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. 
To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n") + catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) # #1729 -- copying to the wrong environment here can cause some confusion - if (ok == -1L) cat("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") + if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") # Verbosity should not issue warnings, so cat rather than warning. # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. 
@@ -1123,7 +1181,7 @@ replace_dot_alias = function(e) { if (is.list(k)) { origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) if (is.character(j)) { - if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length ", length(j)) + if (length(j)!=1L) stop("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is ", length(j)) j = match(j, names(k)) if (is.na(j)) stop("Internal error -- item '", origj, "' not found in names of list") # nocov } @@ -1154,7 +1212,7 @@ replace_dot_alias = function(e) { xcolsAns = seq_along(ansvars) icols = icolsAns = integer() } else { - if (!length(leftcols)) stop("Internal error -- column(s) not found: ", paste(ansvars[wna],collapse=", ")) # nocov + if (!length(leftcols)) stop("Internal error -- column(s) not found: ", brackify(ansvars[wna])) # nocov xcols = w[!wna] xcolsAns = which(!wna) map = c(seq_along(i), leftcols) # this map is to handle dups in leftcols, #3635 @@ -1184,8 +1242,8 @@ replace_dot_alias = function(e) { } syms = all.vars(jsub) - syms = syms[ substring(syms,1L,2L)==".." ] - syms = syms[ substring(syms,3L,3L)!="." ] # exclude ellipsis + syms = syms[ startsWith(syms, "..") ] + syms = syms[ substr(syms, 3L, 3L) != "." ] # exclude ellipsis for (sym in syms) { if (sym %chin% names_x) { # if "..x" exists as column name, use column, for backwards compatibility; e.g. package socialmixr in rev dep checks #2779 @@ -1193,7 +1251,7 @@ replace_dot_alias = function(e) { # TODO in future, as warned in NEWS item for v1.11.0 : # warning(sym," in j is looking for ",getName," in calling scope, but a column '", sym, "' exists. 
Column names should not start with ..") } - getName = substring(sym, 3L) + getName = substr(sym, 3L, nchar(sym)) if (!exists(getName, parent.frame())) { if (exists(sym, parent.frame())) next # user did 'manual' prefix; i.e. variable in calling scope has .. prefix stop("Variable '",getName,"' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.") @@ -1335,15 +1393,20 @@ replace_dot_alias = function(e) { setattr(jval,"names",NULL) # discard names of named vectors otherwise each cell in the column would have a name jval = list(jval) } - if (!is.null(jvnames) && !all(jvnames=="")) setattr(jval, 'names', jvnames) # e.g. jvnames=="N" for DT[,.N,] + if (!is.null(jvnames) && any(nzchar(jvnames))) { + if (length(jvnames) > length(jval)) jvnames = jvnames[seq_along(jval)] #4274 + setattr(jval, 'names', jvnames[seq_along(jval)]) # e.g. jvnames=="N" for DT[,.N,] + } jval = as.data.table.list(jval, .named=NULL) } if (is.data.table(jval)) { - setattr(jval, 'class', class(x)) # fix for #64 + # should set the parent class only when jval is a plain data.table #4324 + if (identical(class(jval), c('data.table', 'data.frame'))) + setattr(jval, 'class', class(x)) # fix for #64 if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) setattr(jval, 'sorted', key(x)) - if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov + if (any(vapply_1b(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov } return(jval) } @@ -1371,7 +1434,7 @@ replace_dot_alias = function(e) { SDenv$`-.POSIXt` = function(e1, e2) { if (inherits(e2, 'POSIXt')) { if (verbose && !exists('done_units_report', parent.frame())) { - cat('\nNote: forcing units="secs" on implicit difftime by group; call difftime explicitly to choose custom units') + catf('\nNote: forcing units="secs" on implicit difftime by group; call 
difftime explicitly to choose custom units\n') assign('done_units_report', TRUE, parent.frame()) } return(difftime(e1, e2, units='secs')) @@ -1385,7 +1448,8 @@ replace_dot_alias = function(e) { byval = i bynames = if (missing(on)) head(key(x),length(leftcols)) else names(on) allbyvars = NULL - bysameorder = haskey(i) || (is.sorted(f__) && ((roll == FALSE) || length(f__) == 1L)) # Fix for #1010 + bysameorder = (haskey(i) && identical(leftcols, chmatch(head(key(i),length(leftcols)), names(i)))) || # leftcols leading subset of key(i); see #4917 + (roll==FALSE && is.sorted(f__)) # roll==FALSE is fix for #1010 ## 'av' correct here ?? *** TO DO *** xjisvars = intersect(av, names_x[rightcols]) # no "x." for xvars. # if 'get' is in 'av' use all cols in 'i', fix for bug #34 @@ -1407,7 +1471,7 @@ replace_dot_alias = function(e) { if (length(byval) && length(byval[[1L]])) { if (!bysameorder && isFALSE(byindex)) { - if (verbose) {last.started.at=proc.time();cat("Finding groups using forderv ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Finding groups using forderv ... ");flush.console()} o__ = forderv(byval, sort=keyby, retGrp=TRUE) # The sort= argument is called sortGroups at C level. It's primarily for saving the sort of unique strings at # C level for efficiency when by= not keyby=. Other types also retain appearance order, but at byte level to @@ -1421,7 +1485,7 @@ replace_dot_alias = function(e) { if (verbose) { cat(timetaken(last.started.at),"\n") last.started.at=proc.time() - cat("Finding group sizes from the positions (can be avoided to save RAM) ... ") + catf("Finding group sizes from the positions (can be avoided to save RAM) ... 
") flush.console() # for windows } f__ = attr(o__, "starts", exact=TRUE) @@ -1429,7 +1493,7 @@ replace_dot_alias = function(e) { if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} if (!bysameorder && !keyby) { # TO DO: lower this into forder.c - if (verbose) {last.started.at=proc.time();cat("Getting back original order ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Getting back original order ... ");flush.console()} firstofeachgroup = o__[f__] if (length(origorder <- forderv(firstofeachgroup))) { f__ = f__[origorder] @@ -1441,11 +1505,11 @@ replace_dot_alias = function(e) { } else { if (verbose) last.started.at=proc.time(); if (bysameorder) { - if (verbose) {cat("Finding groups using uniqlist on key ... ");flush.console()} + if (verbose) {catf("Finding groups using uniqlist on key ... ");flush.console()} f__ = uniqlist(byval) } else { if (!is.character(byindex) || length(byindex)!=1L) stop("Internal error: byindex not the index name") # nocov - if (verbose) {cat("Finding groups using uniqlist on index '", byindex, "' ... ", sep="");flush.console()} + if (verbose) {catf("Finding groups using uniqlist on index '%s' ... ", byindex);flush.console()} o__ = getindex(x, byindex) if (is.null(o__)) stop("Internal error: byindex not found") # nocov f__ = uniqlist(byval, order=o__) @@ -1453,7 +1517,7 @@ replace_dot_alias = function(e) { if (verbose) { cat(timetaken(last.started.at),"\n") last.started.at=proc.time() - cat("Finding group sizes from the positions (can be avoided to save RAM) ... ") + catf("Finding group sizes from the positions (can be avoided to save RAM) ... ") flush.console() # for windows } len__ = uniqlengths(f__, xnrow) @@ -1603,7 +1667,8 @@ replace_dot_alias = function(e) { jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) 
jn__ = if (is.null(names(jl__))) rep("", length(jl__)) else names(jl__) idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) - if (any(idx)) jn__[idx & (jn__ == "")] = "I" + if (any(idx)) + jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && jvnames = c(jvnames, jn__) jsubl[[i_]] = jl__ } @@ -1644,9 +1709,9 @@ replace_dot_alias = function(e) { } if (verbose) { if (!identical(oldjsub, jsub)) - cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") + catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L)) else - cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") + catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L)) } dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with @@ -1656,7 +1721,7 @@ replace_dot_alias = function(e) { GForce = FALSE if ( (is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N") ) { GForce = TRUE - if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") + if (verbose) catf("GForce optimized j to '%s'\n",deparse(jsub, width.cutoff=200L, nlines=1L)) } } else { # Apply GForce @@ -1666,8 +1731,9 @@ replace_dot_alias = function(e) { # is.symbol() is for #1369, #1974 and #2949 if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 - if ((length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1 %chin% c("head","tail"))) return(TRUE) - # ... 
head-tail uses default value n=6 which as of now should not go gforce ^^ + if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na"))) && (!q1 %chin% c("head","tail"))) return(TRUE) + # ^^ base::startWith errors on NULL unfortunately + # head-tail uses default value n=6 which as of now should not go gforce ... ^^ # otherwise there must be three arguments, and only in two cases: # 1) head/tail(x, 1) or 2) x[n], n>0 length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && @@ -1690,8 +1756,8 @@ replace_dot_alias = function(e) { jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } - if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") - } else if (verbose) cat("GForce is on, left j unchanged\n"); + if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } else if (verbose) catf("GForce is on, left j unchanged\n"); } } if (!GForce && !is.name(jsub)) { @@ -1714,9 +1780,9 @@ replace_dot_alias = function(e) { } if (verbose) { if (!identical(oldjsub, jsub)) - cat("Old mean optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") + catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) else - cat("Old mean optimization is on, left j unchanged.\n") + catf("Old mean optimization is on, left j unchanged.\n") } assign("Cfastmean", Cfastmean, SDenv) # Old comments still here for now ... @@ -1726,8 +1792,8 @@ replace_dot_alias = function(e) { # when fastmean can do trim. 
} } else if (verbose) { - if (getOption("datatable.optimize")<1L) cat("All optimizations are turned off\n") - else cat("Optimization is on but left j unchanged (single plain symbol): '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") + if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") + else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } if (byjoin) { groups = i @@ -1756,7 +1822,7 @@ replace_dot_alias = function(e) { # for consistency of empty case in test 184 f__=len__=0L } - if (verbose) {last.started.at=proc.time();cat("Making each group and running j (GForce ",GForce,") ... ",sep="");flush.console()} + if (verbose) {last.started.at=proc.time();catf("Making each group and running j (GForce %s) ... ", GForce);flush.console()} if (GForce) { thisEnv = new.env() # not parent=parent.frame() so that gsum is found for (ii in ansvars) assign(ii, x[[ii]], thisEnv) @@ -1802,7 +1868,7 @@ replace_dot_alias = function(e) { cnames = as.character(bysubl)[-1L] cnames = gsub('^`|`$', '', cnames) # the wrapping backticks that were added above can be removed now, #3378 if (all(cnames %chin% names_x)) { - if (verbose) {last.started.at=proc.time();cat("setkey() after the := with keyby= ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("setkey() after the := with keyby= ... ");flush.console()} setkeyv(x,cnames) # TO DO: setkey before grouping to get memcpy benefit. if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } @@ -1829,7 +1895,7 @@ replace_dot_alias = function(e) { setnames(ans,seq_along(bynames),bynames) # TO DO: reinvestigate bynames flowing from dogroups here and simplify } if (byjoin && keyby && !bysameorder) { - if (verbose) {last.started.at=proc.time();cat("setkey() afterwards for keyby=.EACHI ... ");flush.console()} + if (verbose) {last.started.at=proc.time();catf("setkey() afterwards for keyby=.EACHI ... 
");flush.console()} setkeyv(ans,names(ans)[seq_along(byval)]) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } else if (keyby || (haskey(x) && bysameorder && (byjoin || (length(allbyvars) && identical(allbyvars,head(key(x),length(allbyvars))))))) { @@ -1842,7 +1908,7 @@ replace_dot_alias = function(e) { if (length(expr)==2L) # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE return(call(".External",quote(Cfastmean),expr[[2L]], FALSE)) # return(call(".Internal",expr)) # slightly faster than .External, but R now blocks .Internal in coerce.c from apx Sep 2012 - if (length(expr)==3L && identical("na",substring(names(expr)[3L], 1L, 2L))) # one parameter passed to mean() + if (length(expr)==3L && startsWith(names(expr)[3L], "na")) # one parameter passed to mean() return(call(".External",quote(Cfastmean),expr[[2L]], expr[[3L]])) # faster than .Call assign("nomeanopt",TRUE,parent.frame()) expr # e.g. trim is not optimized, just na.rm @@ -2322,25 +2388,23 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR join = TRUE } dtq[["j"]] = substitute( - list(.ll.tech.split=list(.expr)), - list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) # simplify when `nomatch` accept NULL #857 ? + list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=".")), + list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) ) - by.or.keyby = if (join) "by" else c("by"[!sorted], "keyby"[sorted])[1L] - dtq[[by.or.keyby]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. + dtq[["by"]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. 
.expr, - list(.expr = if(join) {as.name(".EACHI")} else if (flatten) by else .by) + list(.expr = if (join) as.name(".EACHI") else if (flatten) by else .by) ) + dtq[["keyby"]] = if (join) FALSE else sorted dtq[[".SDcols"]] = if (keep.by) names(x) else setdiff(names(x), if (flatten) by else .by) if (join) dtq[["on"]] = if (flatten) by else .by dtq = as.call(dtq) - if (isTRUE(verbose)) cat("Processing split.data.table with: ", deparse(dtq, width.cutoff=500L), "\n", sep="") + if (isTRUE(verbose)) catf("Processing split.data.table with: %s\n", deparse(dtq, width.cutoff=500L)) tmp = eval(dtq) # add names on list - setattr(ll <- tmp$.ll.tech.split, - "names", - as.character( - if (!flatten) tmp[[.by]] else tmp[, list(.nm.tech.split=paste(unlist(lapply(.SD, as.character)), collapse = ".")), by=by, .SDcols=by]$.nm.tech.split - )) + ll = tmp$.ll.tech.split + nm = tmp$.ll.tech.split.names + setattr(ll, "names", nm) # handle nested split if (flatten || length(by) == 1L) { for (x in ll) .Call(C_unlock, x) @@ -2521,7 +2585,7 @@ setnames = function(x,old,new,skip_absent=FALSE) { } } } - if (any(w <- new==names(x)[i] & Encoding(new)==Encoding(names(x)[i]))) { + if (any(w <- new==names(x)[i] & Encoding(new)==Encoding(names(x)[i]))) { # this & is correct not && w = which(!w) new = new[w] i = i[w] @@ -2933,7 +2997,7 @@ isReallyReal = function(x) { RHS = eval(stub[[3L]], x, enclos) if (is.list(RHS)) RHS = as.character(RHS) # fix for #961 if (length(RHS) != 1L && !operator %chin% c("%in%", "%chin%")){ - if (length(RHS) != nrow(x)) stop(gettextf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x), domain="R-data.table"), domain=NA) + if (length(RHS) != nrow(x)) stop(domain=NA, gettextf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). 
Consider %%in%% instead.", operator, length(RHS), nrow(x))) return(NULL) # DT[colA == colB] regular element-wise vector scan } if ( mode(x[[col]]) != mode(RHS) || # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS!isReallyReal are optimized (both sides mode 'numeric') @@ -2965,7 +3029,7 @@ isReallyReal = function(x) { ## convert i to data.table with all combinations in rows. if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){ ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635 - if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()} + if (verbose) {catf("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()} return(NULL) } ## Care is needed with names as we construct i @@ -2978,14 +3042,15 @@ isReallyReal = function(x) { i = do.call(CJ, i) setnames(i, colNames) idx = NULL - if(is.null(idx)){ - ## check whether key fits the columns in i. - ## order of key columns makes no difference, as long as they are all upfront in the key, I believe. - if (all(names(i) %chin% head(key(x), length(i)))){ - if (verbose) {cat("Optimized subsetting with key '", paste0( head(key(x), length(i)), collapse = ", "),"'\n",sep="");flush.console()} - idx = integer(0L) ## integer(0L) not NULL! Indicates that x is ordered correctly. - idxCols = head(key(x), length(i)) ## in correct order! - } + if (is.null(idx)) { + ## check whether key fits the columns in i. + ## order of key columns makes no difference, as long as they are all upfront in the key, I believe. + key_head = head(key(x), length(i)) + if (all(names(i) %chin% key_head)) { + if (verbose) {catf("Optimized subsetting with key %s", brackify(key_head)); flush.console()} + idx = integer(0L) ## integer(0L) not NULL! Indicates that x is ordered correctly. + idxCols = key_head ## in correct order! 
+ } } if (is.null(idx)){ if (!getOption("datatable.use.index")) return(NULL) # #1422 @@ -3001,17 +3066,17 @@ isReallyReal = function(x) { } } if (!is.null(idx)){ - if (verbose) {cat("Optimized subsetting with index '", paste0( idxCols, collapse = "__"),"'\n",sep="");flush.console()} + if (verbose) {catf("Optimized subsetting with index '%s'\n", paste0( idxCols, collapse = "__"));flush.console()} } } if (is.null(idx)){ ## if nothing else helped, auto create a new index that can be used if (!getOption("datatable.auto.index")) return(NULL) - if (verbose) {cat("Creating new index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} - if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(names(i), collapse = "__"), "done in ... ");flush.console()} + if (verbose) {catf("Creating new index '%s'\n", paste0(names(i), collapse = "__"));flush.console()} + if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste0(names(i), collapse = "__"));flush.console()} setindexv(x, names(i)) if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} - if (verbose) {cat("Optimized subsetting with index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} + if (verbose) {catf("Optimized subsetting with index '%s'\n", paste0(names(i), collapse = "__"));flush.console()} idx = attr(attr(x, "index", exact=TRUE), paste0("__", names(i), collapse = ""), exact=TRUE) idxCols = names(i) } @@ -3118,7 +3183,7 @@ isReallyReal = function(x) { } idx_op = match(operators, ops, nomatch=0L) if (any(idx_op %in% c(0L, 6L))) - stop("Invalid operators ", paste(operators[idx_op %in% c(0L, 6L)], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") + stop(domain=NA, gettextf("Invalid join operators %s. 
Only allowed operators are %s.", brackify(operators[idx_op %in% c(0L, 6L)]), brackify(ops[1:5]))) ## the final on will contain the xCol as name, the iCol as value on = iCols names(on) = xCols diff --git a/R/devel.R b/R/devel.R index b0dfb71858..1da19b7c98 100644 --- a/R/devel.R +++ b/R/devel.R @@ -13,7 +13,7 @@ dcf.repo = function(pkg, repo, field, type) { idx = file(file.path(contrib.url(repo, type=type),"PACKAGES")) on.exit(close(idx)) dcf = read.dcf(idx, fields=c("Package",field)) - if (!pkg %in% dcf[,"Package"]) stop(gettextf("There is no package %s in provided repository.", pkg, domain='R-data.table')) + if (!pkg %in% dcf[,"Package"]) stop(domain=NA, gettextf("There is no package %s in provided repository.", pkg)) dcf[dcf[,"Package"]==pkg, field][[1L]] } @@ -28,8 +28,8 @@ update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i # get Revision field from remote repository PACKAGES file una = is.na(ups<-dcf.repo(pkg, repo, field, type)) if (una) - cat(sprintf("No revision information found in DESCRIPTION file for %s package. Unsure '%s' is correct field in PACKAGES file in your package repository '%s'. Otherwise package will be re-installed every time, proceeding to installation.\n", - pkg, field, contrib.url(repo, type=type))) + catf("No revision information found in DESCRIPTION file for %s package. Unsure '%s' is correct field in PACKAGES file in your package repository '%s'. 
Otherwise package will be re-installed every time, proceeding to installation.\n", + pkg, field, contrib.url(repo, type=type)) # see if Revision is different then currently installed Revision, note that installed package will have Revision info only when it was installed from remote devel repo upg = una || !identical(ups, dcf.lib(pkg, field, lib.loc=lib)) # update.dev.pkg fails on windows R 4.0.0, we have to unload package namespace before installing new version #4403 @@ -50,7 +50,7 @@ update.dev.pkg = function(object="data.table", repo="https://Rdatatable.gitlab.i .git = function(quiet=FALSE, lib.loc=NULL) { ans = unname(read.dcf(system.file("DESCRIPTION", package="data.table", lib.loc=lib.loc, mustWork=TRUE), fields="Revision")[, "Revision"]) if (!quiet && is.na(ans)) - cat("Git revision is not available. Most likely data.table was installed from CRAN or local archive.\nGit revision is available when installing from our repositories 'https://Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data.table'.\n") + catf("Git revision is not available. Most likely data.table was installed from CRAN or local archive.\nGit revision is available when installing from our repositories 'https://Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data.table'.\n") ans } diff --git a/R/duplicated.R b/R/duplicated.R index 1ae7e8a6e4..249a5470c5 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -1,14 +1,12 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) 
{ if (!cedta()) return(NextMethod("duplicated")) #nocov - if (!identical(incomparables, FALSE)) { + if (!isFALSE(incomparables)) { .NotYetUsed("incomparables != FALSE") } if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0L)) # fix for bug #28 if (is.na(fromLast) || !is.logical(fromLast)) stop("'fromLast' must be TRUE or FALSE") + if (!length(by)) by = NULL #4594 query = .duplicated.helper(x, by) - # fix for bug #44 - unique on null data table returns error (because of 'forderv') - # however, in this case we can bypass having to go to forderv at all. - if (!length(query$by)) return(logical(0L)) if (query$use.keyprefix) { f = uniqlist(shallow(x, query$by)) @@ -27,10 +25,11 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_along(x), ...) { if (!cedta()) return(NextMethod("unique")) # nocov - if (!identical(incomparables, FALSE)) { + if (!isFALSE(incomparables)) { .NotYetUsed("incomparables != FALSE") } if (nrow(x) <= 1L) return(x) + if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) # if by=key(x), forderv tests for orderedness within it quickly and will short-circuit # there isn't any need in unique() to call uniqlist like duplicated does; uniqlist returns a new nrow(x) vector anyway and isn't @@ -105,14 +104,15 @@ uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) if (is.logical(x)) return(.Call(CuniqueNlogical, x, na.rm=na.rm)) x = as_list(x) } + if (!length(by)) by = NULL #4594 o = forderv(x, by=by, retGrp=TRUE, na.last=if (!na.rm) FALSE else NA) starts = attr(o, 'starts', exact=TRUE) - if (!na.rm) { - length(starts) - } else { + if (na.rm) { # TODO: internal efficient sum # fix for #1771, account for already sorted input sum( (if (length(o)) o[starts] else starts) != 0L) + } else { + length(starts) } } diff --git a/R/fcast.R b/R/fcast.R index dbde95846a..a95f03a448 100644 --- a/R/fcast.R +++ 
b/R/fcast.R @@ -57,7 +57,7 @@ value_vars = function(value.var, varnames) { valnames = unique(unlist(value.var)) iswrong = which(!valnames %chin% varnames) if (length(iswrong)) - stop("value.var values [", paste(value.var[iswrong], collapse=", "), "] are not found in 'data'.") + stop("value.var values ", brackify(value.var[iswrong]), " are not found in 'data'.") value.var } diff --git a/R/fmelt.R b/R/fmelt.R index 3594fce8ca..009369ea9e 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -3,7 +3,7 @@ # reshape2 package is deprecated since December 2017, so we'll deprecate our # redirection as well -melt <- function(data, ..., na.rm = FALSE, value.name = "value") { +melt = function(data, ..., na.rm = FALSE, value.name = "value") { if (is.data.table(data)) { UseMethod("melt", data) # if data is not data.table and reshape2 is installed, this won't dispatch to reshape2's method; @@ -22,12 +22,172 @@ melt <- function(data, ..., na.rm = FALSE, value.name = "value") { patterns = function(..., cols=character(0L)) { # if ... has no names, names(list(...)) will be ""; # this assures they'll be NULL instead - p = unlist(list(...), use.names = any(nzchar(names(...)))) + L = list(...) + p = unlist(L, use.names = any(nzchar(names(L)))) if (!is.character(p)) stop("Input patterns must be of type character.") - lapply(p, grep, cols) + matched = lapply(p, grep, cols) + # replace with lengths when R 3.2.0 dependency arrives + if (length(idx <- which(sapply(matched, length) == 0L))) + stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', + paste(p[idx], collapse = ', '), ']') + matched } +measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") { + mcall = match.call() + L = as.list(mcall)[-1] + formal.names = names(formals()) + formal.i.vec = which(names(L) %in% formal.names) + fun.list = L[-formal.i.vec] + user.named = names(fun.list) != "" + is.symb = sapply(fun.list, is.symbol) + bad.i = which((!user.named) & (!is.symb)) + if (length(bad.i)) { + stop("each ... 
argument to measure must be either a symbol without argument name, or a function with argument name, problems: ", paste(bad.i, collapse=",")) + } + names(fun.list)[!user.named] = sapply(fun.list[!user.named], paste) + fun.list[!user.named] = list(NULL) + # group names error checking. + group.is.formal = names(fun.list) %in% formal.names + if (any(group.is.formal)) { + bad.names = names(fun.list)[group.is.formal] + stop("group names specified in ... conflict with measure argument names; please fix by changing group names: ", paste(bad.names, collapse=",")) + } + # evaluate each value in ... and stop if not function. + for (fun.i in which(user.named)) { + fun = eval(fun.list[[fun.i]], parent.frame(1L)) + if (!is.function(fun) || length(formals(args(fun)))==0) { + stop("each ... argument to measure must be a function with at least one argument, problem: ", names(fun.list)[[fun.i]]) + } + fun.list[[fun.i]] = fun + } + measurev.args = c( + list(fun.list), + L[formal.i.vec], + list(group.desc="... arguments to measure")) + do.call(measurev, measurev.args) +} + +measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name", group.desc="elements of fun.list"){ + # 1. basic error checking. 
+ if (!missing(sep) && !missing(pattern)) { + stop("both sep and pattern arguments used; must use either sep or pattern (not both)") + } + if (!(is.character(multiple.keyword) && length(multiple.keyword)==1 && !is.na(multiple.keyword) && nchar(multiple.keyword)>0)) { + stop("multiple.keyword must be a character string with nchar>0") + } + if (!is.character(cols)) { + stop("cols must be a character vector of column names") + } + prob.i <- if (is.null(names(fun.list))) { + seq_along(fun.list) + } else { + which(names(fun.list) == "") + } + if (length(prob.i)) { + stop("in measurev, ", group.desc, " must be named, problems: ", paste(prob.i, collapse=",")) + } + err.names.unique = function(err.what, name.vec) { + name.tab = table(name.vec) + bad.counts = name.tab[1 < name.tab] + if (length(bad.counts)) { + stop(err.what, " should be uniquely named, problems: ", paste(names(bad.counts), collapse=",")) + } + } + err.args.groups = function(type, N){ + if (N != length(fun.list)) { + stop("number of ", group.desc, " =", length(fun.list), " must be same as ", type, " =", N) + } + } + err.names.unique(group.desc, names(fun.list)) + # 2. compute initial group data table, used as variable_table attribute. 
+ group.mat = if (!missing(pattern)) { + if (!is.character(pattern)) { + stop("pattern must be character string") + } + match.vec = regexpr(pattern, cols, perl=TRUE) + measure.vec = which(0 < match.vec) + if (length(measure.vec) == 0L) { + stop("pattern did not match any cols, so nothing would be melted; fix by changing pattern") + } + start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] + if (is.null(start)) { + stop("pattern must contain at least one capture group (parenthesized sub-pattern)") + } + err.args.groups("number of capture groups in pattern", ncol(start)) + end = attr(match.vec, "capture.length")[measure.vec,]+start-1L + names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) + substr(names.mat, start, end) + } else { #pattern not specified, so split using sep. + if (!is.character(sep)) { + stop("sep must be character string") + } + list.of.vectors = strsplit(cols, sep, fixed=TRUE) + vector.lengths = sapply(list.of.vectors, length) + n.groups = max(vector.lengths) + if (n.groups == 1) { + stop("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") + } + err.args.groups("max number of items after splitting column names", n.groups) + measure.vec = which(vector.lengths==n.groups) + do.call(rbind, list.of.vectors[measure.vec]) + } + err.names.unique("measured columns", cols[measure.vec]) + uniq.mat = unique(group.mat) + if (nrow(uniq.mat) < nrow(group.mat)) { + stop("number of unique column IDs =", nrow(uniq.mat), " is less than number of melted columns =", nrow(group.mat), "; fix by changing pattern/sep") + } + colnames(group.mat) = names(fun.list) + group.dt = data.table(group.mat) + # 3. apply conversion functions to group data table. 
+ fun.i.vec = which(!sapply(fun.list, is.null)) + for (group.i in fun.i.vec) { + group.name = names(fun.list)[[group.i]] + fun = fun.list[[group.i]] + if (!is.function(fun) || length(formals(args(fun)))==0) { + stop("in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: ", group.name) + } + group.val = fun(group.dt[[group.name]]) + if (!(is.atomic(group.val) && length(group.val)==nrow(group.dt))) { + stop("each conversion function must return an atomic vector with same length as its first argument, problem: ", group.name) + } + if (all(is.na(group.val))) { + stop(group.name, " conversion function returned vector of all NA") + } + set(group.dt, j=group.name, value=group.val) + } + group.uniq = unique(group.dt) + if (nrow(group.uniq) < nrow(group.dt)) { + stop("number of unique groups after applying type conversion functions less than number of groups, change type conversion") + } + # 4. compute measure.vars list or vector. + if (multiple.keyword %in% names(fun.list)) {# multiple output columns. 
+ if (!is.character(group.dt[[multiple.keyword]])) { + stop(multiple.keyword, " column class=", class(group.dt[[multiple.keyword]])[[1L]], " after applying conversion function, but must be character") + } + is.other = names(group.dt) != multiple.keyword + if (!any(is.other)) { + stop(multiple.keyword, " is the only group; fix by creating at least one more group") + } + other.values = lapply(group.dt[, is.other, with=FALSE], unique) + other.values$stringsAsFactors = FALSE + other.dt = data.table(do.call(expand.grid, other.values)) + measure.list = structure(list(), variable_table=other.dt) + column.values = unique(group.dt[[multiple.keyword]]) + for (column.val in column.values) { + select.dt = data.table(other.dt) + set(select.dt, j=multiple.keyword, value=column.val) + measure.list[[column.val]] = data.table( + measure.vec, group.dt + )[select.dt, measure.vec, on=names(select.dt)] + } + measure.list + } else {# single output column. + structure(measure.vec, variable_table=group.dt) + } +} + melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", value.name = "value", ..., na.rm = FALSE, variable.factor = TRUE, value.factor = FALSE, verbose = getOption("datatable.verbose")) { @@ -35,8 +195,11 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl if (missing(id.vars)) id.vars=NULL if (missing(measure.vars)) measure.vars = NULL measure.sub = substitute(measure.vars) - if (measure.sub %iscall% "patterns") { - measure.vars = do_patterns(measure.sub, names(data)) + if (is.call(measure.sub)) { + eval.result = eval_with_cols(measure.sub, names(data)) + if (!is.null(eval.result)) { + measure.vars = eval.result + } } if (is.list(measure.vars) && length(measure.vars) > 1L) { meas.nm = names(measure.vars) @@ -62,8 +225,8 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl variable.name, value.name, as.logical(na.rm), as.logical(verbose)) setDT(ans) - if (any(duplicated(names(ans)))) { - 
cat("Duplicate column names found in molten data.table. Setting unique names using 'make.names'\n") + if (anyDuplicated(names(ans))) { + catf("Duplicate column names found in molten data.table. Setting unique names using 'make.names'\n") setnames(ans, make.unique(names(ans))) } setattr(ans, 'sorted', NULL) diff --git a/R/foverlaps.R b/R/foverlaps.R index 8028482abb..fc0b706ccd 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -128,7 +128,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k end = yintervals[2L], any =, within =, equal = yintervals) call = construct(head(ynames, -2L), uycols, type) - if (verbose) {last.started.at=proc.time();cat("unique() + setkey() operations done in ...");flush.console()} + if (verbose) {last.started.at=proc.time();catf("unique() + setkey() operations done in ...");flush.console()} uy = unique(y[, eval(call)]) # this started to fail from R 4.1 due to c(POSIXct, numeric) setkey(uy)[, `:=`(lookup = list(list(integer(0L))), type_lookup = list(list(integer(0L))), count=0L, type_count=0L)] if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} @@ -154,7 +154,7 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k .Call(Clookup, uy, nrow(y), indices(uy, y, yintervals, nomatch=0L, roll=roll), maxgap, minoverlap, mult, type, verbose) if (maxgap == 0L && minoverlap == 1L) { # iintervals = tail(names(x), 2L) # iintervals not yet used so commented out for now - if (verbose) {last.started.at=proc.time();cat("binary search(es) done in ...");flush.console()} + if (verbose) {last.started.at=proc.time();catf("binary search(es) done in ...");flush.console()} xmatches = indices(uy, x, xintervals, nomatch=0L, roll=roll) if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} olaps = .Call(Coverlaps, uy, xmatches, mult, type, nomatch, verbose) diff --git a/R/frank.R b/R/frank.R index 763b8267e5..47e701c4cd 100644 --- a/R/frank.R +++ b/R/frank.R @@ -22,10 +22,13 @@ 
frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a if (!length(cols)) stop("x is a list, 'cols' can not be 0-length") } - x = .shallow(x, cols) # shallow copy even if list.. + # need to unlock for #4429 + x = .shallow(x, cols, unlock = TRUE) # shallow copy even if list.. setDT(x) cols = seq_along(cols) if (is.na(na.last)) { + if ("..na_prefix.." %chin% names(x)) + stop("Input column '..na_prefix..' conflicts with data.table internal usage; please rename") set(x, j = "..na_prefix..", value = is_na(x, cols)) order = if (length(order) == 1L) c(1L, rep(order, length(cols))) else c(1L, order) cols = c(ncol(x), cols) @@ -39,6 +42,8 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a idx = NULL n = nrow(x) } + if ('..stats_runif..' %chin% names(x)) + stop("Input column '..stats_runif..' conflicts with data.table internal usage; please rename") set(x, idx, '..stats_runif..', stats::runif(n)) order = if (length(order) == 1L) c(rep(order, length(cols)), 1L) else c(order, 1L) cols = c(cols, ncol(x)) diff --git a/R/fread.R b/R/fread.R index 0da96fe0e4..eb765fe639 100644 --- a/R/fread.R +++ b/R/fread.R @@ -21,21 +21,26 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) { stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } - stopifnot( isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), - isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml) ) - stopifnot( isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0)) - stopifnot( is.numeric(nrows), length(nrows)==1L ) - if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does + stopifnot( + isTRUEorFALSE(strip.white), 
isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), + isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), + isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), + is.numeric(nrows), length(nrows)==1L + ) + nrows=as.double(nrows) #4686 + if (is.na(nrows) || nrows<0) nrows=Inf # accept -1 to mean Inf, as read.table does if (identical(header,"auto")) header=NA - stopifnot(is.logical(header) && length(header)==1L) # TRUE, FALSE or NA - stopifnot(is.numeric(nThread) && length(nThread)==1L) + stopifnot( + is.logical(header) && length(header)==1L, # TRUE, FALSE or NA + is.numeric(nThread) && length(nThread)==1L + ) nThread=as.integer(nThread) stopifnot(nThread>=1L) if (!is.null(text)) { if (!is.character(text)) stop("'text=' is type ", typeof(text), " but must be character.") if (!length(text)) return(data.table()) if (length(text) > 1L) { - cat(text, file=(tmpFile<-tempfile(tmpdir=tmpdir)), sep="\n") # avoid paste0() which could create a new very long single string in R's memory + writeLines(text, tmpFile<-tempfile(tmpdir=tmpdir)) # avoid paste0() which could create a new very long single string in R's memory file = tmpFile on.exit(unlink(tmpFile), add=TRUE) } else { @@ -50,13 +55,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (input=="" || length(grep('\\n|\\r', input))) { # input is data itself containing at least one \n or \r } else { - if (substring(input,1L,1L)==" ") { + if (startsWith(input, " ")) { stop("input= contains no \\n or \\r, but starts with a space. 
Please remove the leading space, or use text=, file= or cmd=") } - str6 = substring(input,1L,6L) # avoid grepl() for #2531 - str7 = substring(input,1L,7L) - str8 = substring(input,1L,8L) - if (str7=="ftps://" || str8=="https://") { + str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 + if (str7=="ftps://" || startsWith(input, "https://")) { # nocov start if (!requireNamespace("curl", quietly = TRUE)) stop("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov @@ -66,7 +69,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(tmpFile), add=TRUE) # nocov end } - else if (str6=="ftp://" || str7== "http://" || str7=="file://") { + else if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") { # nocov start method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto") # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668 @@ -80,7 +83,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command cmd = input if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { - message("Taking input= as a system command ('",cmd,"') and a variable has been used in the expression passed to `input=`. Please use fread(cmd=...). There is a security concern if you are creating an app, and the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.") + message("Taking input= as a system command because it contains a space ('",cmd,"'). If it's a filename please remove the space, or use file= explicitly. 
A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.") } } else { @@ -102,12 +105,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (data.table) 'data.table' else 'data.frame', ".") return(if (data.table) data.table(NULL) else data.frame(NULL)) } - ext2 = substring(file, nchar(file)-2L, nchar(file)) # last 3 characters ".gz" - ext3 = substring(file, nchar(file)-3L, nchar(file)) # last 4 characters ".bz2" - if (ext2==".gz" || ext3==".bz2") { + if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) { if (!requireNamespace("R.utils", quietly = TRUE)) stop("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (ext2==".gz") gzfile else bzfile + FUN = if (is_gz) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) @@ -169,9 +170,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") yaml_border_re = '^#?---' if (!grepl(yaml_border_re, first_line)) { close(f) - stop('Encountered <', substring(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...', '> at the first ', - 'unskipped line (', 1L+skip, '), which does not constitute the start to a valid YAML header ', - '(expecting something matching regex "', yaml_border_re, '"); please check your input and try again.') + stop(gettextf( + 'Encountered <%s%s> at the 
first unskipped line (%d), which does not constitute the start to a valid YAML header (expecting something matching regex "%s"); please check your input and try again.', + substr(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...' else '', 1L+skip, yaml_border_re + )) } yaml_comment_re = '^#' @@ -193,7 +195,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") yaml_header = yaml::yaml.load(yaml_string) yaml_names = names(yaml_header) - if (verbose) cat('Processed', n_read, 'lines of YAML metadata with the following top-level fields:', brackify(yaml_names), '\n') + if (verbose) catf('Processed %d lines of YAML metadata with the following top-level fields: %s\n', n_read, brackify(yaml_names)) # process header first since it impacts how to handle colClasses if ('header' %chin% yaml_names) { if ('header' %chin% call_args) message("User-supplied 'header' will override that found in metadata.") @@ -201,7 +203,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if ('schema' %chin% yaml_names) { new_types = sapply(yaml_header$schema$fields, `[[`, 'type') - if (any(null_idx <- sapply(new_types, is.null))) + if (any(null_idx <- vapply_1b(new_types, is.null))) new_types = do.call(c, new_types) synonms = rbindlist(list( character = list(syn = c('character', 'string')), @@ -325,7 +327,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } else { cols_to_factor = which(vapply_1b(ans, is.character)) } - if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="") + if (verbose) catf("stringsAsFactors=%s converted %d column(s): %s\n", stringsAsFactors, length(cols_to_factor), brackify(names(ans)[cols_to_factor])) for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j))) } @@ -341,10 +343,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if (yaml) setattr(ans, 'yaml_metadata', yaml_header) if (!is.null(index) && data.table) { - 
if (!all(sapply(index, is.character))) + if (!all(vapply_1b(index, is.character))) stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") if (is.list(index)) { - to_split = sapply(index, length) == 1L + to_split = vapply_1i(index, length) == 1L if (any(to_split)) index[to_split] = sapply(index[to_split], strsplit, split = ",", fixed = TRUE) } else { diff --git a/R/fwrite.R b/R/fwrite.R index 1971c0e4ea..8325f137d3 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -1,5 +1,6 @@ fwrite = function(x, file="", append=FALSE, quote="auto", - sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", + sep=getOption("datatable.fwrite.sep", ","), + sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", na="", dec=".", row.names=FALSE, col.names=TRUE, qmethod=c("double","escape"), logical01=getOption("datatable.logical01", FALSE), # due to change to TRUE; see NEWS @@ -11,8 +12,12 @@ fwrite = function(x, file="", append=FALSE, quote="auto", compress = c("auto", "none", "gzip"), yaml = FALSE, bom = FALSE, - verbose=getOption("datatable.verbose", FALSE)) { + verbose=getOption("datatable.verbose", FALSE), + encoding = "") { na = as.character(na[1L]) # fix for #1725 + if (length(encoding) != 1L || !encoding %chin% c("", "UTF-8", "native")) { + stop("Argument 'encoding' must be '', 'UTF-8' or 'native'.") + } if (missing(qmethod)) qmethod = qmethod[1L] if (missing(compress)) compress = compress[1L] if (missing(dateTimeAs)) { dateTimeAs = dateTimeAs[1L] } @@ -58,7 +63,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", file = path.expand(file) # "~/foo/bar" if (append && (file=="" || file.exists(file))) { if (missing(col.names)) col.names = FALSE - if (verbose) cat("Appending to existing file so setting bom=FALSE and yaml=FALSE\n") + if (verbose) catf("Appending to existing file so setting bom=FALSE and yaml=FALSE\n") bom = FALSE yaml = FALSE } @@ -108,7 +113,7 @@ 
fwrite = function(x, file="", append=FALSE, quote="auto", file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, bom, yaml, verbose) + showProgress, is_gzip, bom, yaml, verbose, encoding) invisible() } diff --git a/R/groupingsets.R b/R/groupingsets.R index 6281615dd5..2300d09da0 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -27,10 +27,12 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, ...) { stop("Argument 'by' must be a character vector of column names used in grouping.") if (!is.logical(id)) stop("Argument 'id' must be a logical scalar.") + if (missing(j)) + stop("Argument 'j' is required") # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) - sets = lapply((2L^n):1L, function(j) by[keepBool[j, ]]) + sets = lapply((2L^n):1L, function(jj) by[keepBool[jj, ]]) # redirect to workhorse function jj = substitute(j) groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj) @@ -51,7 +53,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stop("Argument 'by' must be a character vector of column names used in grouping.") if (anyDuplicated(by) > 0L) stop("Argument 'by' must have unique column names for grouping.") - if (!is.list(sets) || !all(sapply(sets, is.character))) + if (!is.list(sets) || !all(vapply_1b(sets, is.character))) stop("Argument 'sets' must be a list of character vectors.") if (!is.logical(id)) stop("Argument 'id' must be a logical scalar.") @@ -60,7 +62,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stop("All columns used in 'sets' argument must be in 'by' too. 
Columns used in 'sets' but not present in 'by': ", brackify(setdiff(sets.all.by, by))) if (id && "grouping" %chin% names(x)) stop("When using `id=TRUE` the 'x' data.table must not have a column named 'grouping'.") - if (any(sapply(sets, anyDuplicated))) + if (any(vapply_1i(sets, anyDuplicated))) # anyDuplicated returns index of first duplicate, otherwise 0L stop("Character vectors in 'sets' list must not have duplicated column names within a single grouping set.") if (length(sets) > 1L && (idx<-anyDuplicated(lapply(sets, sort)))) warning("'sets' contains a duplicate (i.e., equivalent up to sorting) element at index ", idx, "; as such, there will be duplicate rows in the output -- note that grouping by A,B and B,A will produce the same aggregations. Use `sets=unique(lapply(sets, sort))` to eliminate duplicates.") diff --git a/R/last.R b/R/last.R index abf4050b40..8dff3271a1 100644 --- a/R/last.R +++ b/R/last.R @@ -7,12 +7,12 @@ last = function(x, n=1L, ...) { if (nargs()>1L) { if ("package:xts" %chin% search()) { if (verbose) - cat("last: using xts::last: !is.xts(x) & nargs>1 & 'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "last", "xts::last", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()") xts::last(x, n=n, ...) } else { # nocov start if (verbose) - cat("last: using utils::tail: !is.xts(x) & nargs>1 & !'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()") utils::tail(x, n=n, ...) # nocov end } @@ -20,24 +20,24 @@ last = function(x, n=1L, ...) 
{ dx = dim(x) if (is.null(dx)) { if (verbose) - cat("last: using 'x[[length(x)]]': !is.xts(x) & !nargs>1 & is.null(dim(x))\n") + catf("%s: using %s: %s\n", "last", "'x[[length(x)]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))") lx = length(x) if (!lx) x else x[[lx]] } else if (is.data.frame(x)) { if (verbose) - cat("last: using 'x[nrow(x),]': !is.xts(x) & !nargs>1 & is.data.frame(x)\n") + catf("%s: using %s: %s\n", "last", "'x[nrow(x),]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)") x[dx[1L], , drop=FALSE] } else { if (verbose) - cat("last: using utils::tail: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)\n") + catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") utils::tail(x, n=n, ...) } } } else { if (!requireNamespace("xts", quietly=TRUE)) - stop(gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last", domain="R-data.table")) # nocov + stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last")) # nocov if (verbose) - cat("last: using xts::last: is.xts(x)\n") + catf("%s: using %s: %s\n", "last", "xts::last", "is.xts(x)") xts::last(x, n=n, ...) } } @@ -48,12 +48,12 @@ first = function(x, n=1L, ...) { if (nargs()>1L) { if ("package:xts" %chin% search()) { if (verbose) - cat("first: using xts::first: !is.xts(x) & nargs>1 & 'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "first", "xts::first", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()") xts::first(x, n=n, ...) } else { # nocov start if (verbose) - cat("first: using utils::head: !is.xts(x) & nargs>1 & !'package:xts'%in%search()\n") + catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()") utils::head(x, n=n, ...) # nocov end } @@ -61,24 +61,24 @@ first = function(x, n=1L, ...) 
{ dx = dim(x) if (is.null(dx)) { if (verbose) - cat("first: using 'x[[1L]]': !is.xts(x) & !nargs>1 & is.null(dim(x))\n") + catf("%s: using %s: %s\n", "first", "'x[[1L]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))") lx = length(x) if (!lx) x else x[[1L]] } else if (is.data.frame(x)) { if (verbose) - cat("first: using 'x[1L,]': !is.xts(x) & !nargs>1 & is.data.frame(x)\n") + catf("%s: using %s: %s\n", "first", "'x[1L,]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)") if (!dx[1L]) x else x[1L, , drop=FALSE] } else { if (verbose) - cat("first: using utils::head: !is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)\n") + catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)") utils::head(x, n=n, ...) } } } else { if (!requireNamespace("xts", quietly=TRUE)) - stop(gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first", domain="R-data.table")) # nocov + stop(domain=NA, gettextf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first")) # nocov if (verbose) - cat("first: using xts::first: is.xts(x)\n") + catf("%s: using %s: %s\n", "first", "xts::first", "is.xts(x)") xts::first(x, n=n, ...) } } diff --git a/R/like.R b/R/like.R index c66678c643..dd2a8c5b59 100644 --- a/R/like.R +++ b/R/like.R @@ -3,7 +3,10 @@ # returns 'logical' so can be combined with other where clauses. 
like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE) { if (is.factor(vector)) { - as.integer(vector) %in% grep(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed) + # indexing by factors is equivalent to indexing by the numeric codes, see ?`[` #4748 + ret = grepl(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed)[vector] + ret[is.na(ret)] = FALSE + ret } else { # most usually character, but integer and numerics will be silently coerced by grepl grepl(pattern, vector, ignore.case = ignore.case, fixed = fixed) diff --git a/R/merge.R b/R/merge.R index fe3bdb4549..8dc59e018b 100644 --- a/R/merge.R +++ b/R/merge.R @@ -11,9 +11,17 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by = key(x) } } - if ((x0 <- length(x)==0L) | (y0 <- length(y)==0L)) warning("You are trying to join data.tables where ", if(x0 & y0) "'x' and 'y' arguments are" else if(x0 & !y0) "'x' argument is" else if(!x0 & y0) "'y' argument is", " 0 columns data.table.") - if (any(duplicated(names(x)))) stop("x has some duplicated column name(s): ",paste(names(x)[duplicated(names(x))],collapse=","),". Please remove or rename the duplicate(s) and try again.") - if (any(duplicated(names(y)))) stop("y has some duplicated column name(s): ",paste(names(y)[duplicated(names(y))],collapse=","),". Please remove or rename the duplicate(s) and try again.") + x0 = length(x)==0L + y0 = length(y)==0L + if (x0 || y0) warning(sprintf(ngettext(x0+y0, + "You are trying to join data.tables where %s has 0 columns.", + "You are trying to join data.tables where %s have 0 columns."), + if (x0 && y0) "'x' and 'y'" else if (x0) "'x'" else "'y'" + )) + nm_x = names(x) + nm_y = names(y) + if (anyDuplicated(nm_x)) stop(gettextf("%s has some duplicated column name(s): %s. Please remove or rename the duplicate(s) and try again.", "x", brackify(nm_x[duplicated(nm_x)]))) + if (anyDuplicated(nm_y)) stop(gettextf("%s has some duplicated column name(s): %s. 
Please remove or rename the duplicate(s) and try again.", "y", brackify(nm_y[duplicated(nm_y)]))) ## set up 'by'/'by.x'/'by.y' if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) ) @@ -21,11 +29,11 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!missing(by) && !missing(by.x)) warning("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.") if (!is.null(by.x)) { - if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) + if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y)) stop("A non-empty vector of column names is required for `by.x` and `by.y`.") - if (!all(by.x %chin% names(x))) + if (!all(by.x %chin% nm_x)) stop("Elements listed in `by.x` must be valid column names in x.") - if (!all(by.y %chin% names(y))) + if (!all(by.y %chin% nm_y)) stop("Elements listed in `by.y` must be valid column names in y.") by = by.x names(by) = by.y @@ -35,10 +43,10 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (is.null(by)) by = key(x) if (is.null(by)) - by = intersect(names(x), names(y)) + by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) stop("A non-empty vector of column names for `by` is required.") - if (!all(by %chin% intersect(colnames(x), colnames(y)))) + if (!all(by %chin% intersect(nm_x, nm_y))) stop("Elements listed in `by` must be valid column names in x and y") by = unname(by) by.x = by.y = by @@ -47,8 +55,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL ## sidestep the auto-increment column number feature-leading-to-bug by ## ensuring no names end in ".1", see unit test ## "merge and auto-increment columns in y[x]" in test-data.frame.like.R - start = setdiff(names(x), by.x) - end = setdiff(names(y), by.y) + start = setdiff(nm_x, by.x) + end = setdiff(nm_y, by.y) dupnames = intersect(start, end) if (length(dupnames)) { start[chmatch(dupnames, start, 0L)] = paste0(dupnames, 
suffixes[1L]) @@ -68,7 +76,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) { yy = y[missingyidx] - othercolsx = setdiff(names(x), by) + othercolsx = setdiff(nm_x, by) if (length(othercolsx)) { tmp = rep.int(NA_integer_, length(missingyidx)) # TO DO: use set() here instead.. @@ -80,7 +88,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. - newend = setdiff(names(y), by.y) + newend = setdiff(nm_y, by.y) # fix for #1290, make sure by.y order is set properly before naming setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) setnames(dt, c(by.x, start, end)) diff --git a/R/onAttach.R b/R/onAttach.R index 75b48eb394..3e93187e2e 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -19,13 +19,14 @@ dev = as.integer(v[1L, 3L]) %% 2L == 1L # version number odd => dev if (!isTRUE(getOption("datatable.quiet"))) { # new option in v1.12.4, #3489 packageStartupMessage("data.table ", v, if(dev)paste0(" IN DEVELOPMENT built ",d,g), - " using ", getDTthreads(verbose=FALSE), " threads (see ?getDTthreads). Latest news: r-datatable.com") - if (gettext("TRANSLATION CHECK", domain='R-data.table') != "TRANSLATION CHECK") - packageStartupMessage(gettext("**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********", domain="R-data.table")) + " using ", getDTthreads(verbose=FALSE), " threads (see ?getDTthreads). 
Latest news: r-datatable.com", domain="R-data.table") + # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. + if (gettext(domain="R-data.table", "TRANSLATION CHECK") != "TRANSLATION CHECK") + packageStartupMessage(domain="R-data.table", "**********\nRunning data.table in English; package support is available in English only. When searching for online help, be sure to also check for the English error message. This can be obtained by looking at the po/R-.po and po/.po files in the package source, where the native language and English error messages can be found side-by-side\n**********") if (dev && (Sys.Date() - as.Date(d))>28L) - packageStartupMessage("**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") + packageStartupMessage(domain="R-data.table", "**********\nThis development version of data.table was built more than 4 weeks ago. Please update: data.table::update.dev.pkg()\n**********") if (!.Call(ChasOpenMP)) - packageStartupMessage("**********\n", + packageStartupMessage(domain="R-data.table", "**********\n", "This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.\n", if (Sys.info()["sysname"]=="Darwin") "This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux." 
diff --git a/R/onLoad.R b/R/onLoad.R index 230929c4b6..3750510ece 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -25,11 +25,12 @@ if (dllV != RV) { dll = if (.Platform$OS.type=="windows") "dll" else "so" # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478 - stop("The datatable.",dll," version (",dllV,") does not match the package (",RV,"). Please close all R sessions to release the old ",toupper(dll)," and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check.") + # NB: domain= is necessary in .onAttach and .onLoad, see ?gettext and https://bugs.r-project.org/bugzilla/show_bug.cgi?id=18092. + stop(domain="R-data.table", "The datatable.",dll," version (",dllV,") does not match the package (",RV,"). Please close all R sessions to release the old ",toupper(dll)," and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. 
It is just that data.table has added this check.") } builtUsing = readRDS(system.file("Meta/package.rds",package="data.table"))$Built$R if (!identical(base::getRversion()>="4.0.0", builtUsing>="4.0.0")) { - stop("This is R ", base::getRversion(), " but data.table has been installed using R ",builtUsing,". The major version must match. Please reinstall data.table.") + stop(domain="R-data.table", "This is R ", base::getRversion(), " but data.table has been installed using R ",builtUsing,". The major version must match. Please reinstall data.table.") # the if(R>=4.0.0) in NAMESPACE when registering S3 methods rbind.data.table and cbind.data.table happens on install; #3968 } } @@ -93,14 +94,14 @@ } if (!is.null(getOption("datatable.old.bywithoutby"))) - warning("Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") + warning(domain="R-data.table", "Option 'datatable.old.bywithoutby' has been removed as warned for 2 years. It is now ignored. Please use by=.EACHI instead and stop using this option.") if (!is.null(getOption("datatable.old.unique.by.key"))) - warning("Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. Please use by=key(DT) instead and stop using this option.") + warning(domain="R-data.table", "Option 'datatable.old.unique.by.key' has been removed as warned for 4 years. It is now ignored. 
Please use by=key(DT) instead and stop using this option.") # Test R behaviour that changed in v3.1 and is now depended on x = 1L:3L y = list(x) - if (address(x) != address(y[[1L]])) stop("Unexpected base R behaviour: list(x) has copied x") + if (address(x) != address(y[[1L]])) stop(domain="R-data.table", "Unexpected base R behaviour: list(x) has copied x") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -108,7 +109,7 @@ names(DF) = c("A","B") add3 = address(DF$A) add4 = address(DF$B) - if (add1!=add3 || add2!=add4) stop("Unexpected base R behaviour: names<- has copied column contents") + if (add1!=add3 || add2!=add4) stop(domain="R-data.table", "Unexpected base R behaviour: names<- has copied column contents") DF = data.frame(a=1:3, b=4:6) add1 = address(DF$a) @@ -118,10 +119,10 @@ add4 = address(DF$a) add5 = address(DF$b) add6 = address(DF) - if (add2==add5) stop("Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") - if (add1!=add4) stop("Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") + if (add2==add5) stop(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was assigned to") + if (add1!=add4) stop(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- copied the first column which was not assigned to, too") - if (add3==add6) warning("Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") + if (add3==add6) warning(domain="R-data.table", "Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)") # R could feasibly in future not copy DF's vecsxp in this case. If that changes in R, we'd like to know via the warning # because tests will likely break too. The warning will quickly tell R-core and us why, so we can then update. 
diff --git a/R/print.data.table.R b/R/print.data.table.R index 31a009d5b4..4e666ca22e 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -15,6 +15,8 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # trunc.cols - should only the columns be printed that can fit in the console? (FALSE) if (!col.names %chin% c("auto", "top", "none")) stop("Valid options for col.names are 'auto', 'top', and 'none'") + if (length(trunc.cols) != 1L || !is.logical(trunc.cols) || is.na(trunc.cols)) + stop("Valid options for trunc.cols are TRUE and FALSE") if (col.names == "none" && class) warning("Column classes will be suppressed when col.names is 'none'") if (!shouldPrint(x)) { @@ -41,31 +43,34 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), if (!is.numeric(topn)) topn = 5L topnmiss = missing(topn) topn = max(as.integer(topn),1L) - if (print.keys){ + if (print.keys) { if (!is.null(ky <- key(x))) - cat("Key: <", paste(ky, collapse=", "), ">\n", sep="") + catf("Key: <%s>\n", toString(ky)) if (!is.null(ixs <- indices(x))) - cat("Ind", if (length(ixs) > 1L) "ices" else "ex", ": <", - paste(ixs, collapse=">, <"), ">\n", sep="") + cat(sprintf( + ngettext(length(ixs), "Index: %s\n", "Indices: %s\n"), + paste0("<", ixs, ">", collapse = ", ") + )) } if (any(dim(x)==0L)) { class = if (is.data.table(x)) "table" else "frame" # a data.frame could be passed to print.data.table() directly, #3363 if (all(dim(x)==0L)) { - cat("Null data.",class," (0 rows and 0 cols)\n", sep="") # See FAQ 2.5 and NEWS item in v1.8.9 + catf("Null data.%s (0 rows and 0 cols)\n", class) # See FAQ 2.5 and NEWS item in v1.8.9 } else { - cat("Empty data.",class," (", dim(x)[1L], " rows and ",length(x)," cols)", sep="") + catf("Empty data.%s (%d rows and %d cols)", class, NROW(x), NCOL(x)) if (length(x)>0L) cat(": ",paste(head(names(x),6L),collapse=","),if(length(x)>6L)"...",sep="") cat("\n") } return(invisible(x)) } - if ((topn*2L+1L)nrows || !topnmiss)) { + n_x = 
nrow(x) + if ((topn*2L+1L)nrows || !topnmiss)) { toprint = rbindlist(list(head(x, topn), tail(x, topn)), use.names=FALSE) # no need to match names because head and tail of same x, and #3306 - rn = c(seq_len(topn), seq.int(to=nrow(x), length.out=topn)) + rn = c(seq_len(topn), seq.int(to=n_x, length.out=topn)) printdots = TRUE } else { toprint = x - rn = seq_len(nrow(x)) + rn = seq_len(n_x) printdots = FALSE } toprint=format.data.table(toprint, na.encode=FALSE, timezone = timezone, ...) # na.encode=FALSE so that NA in character cols print as @@ -93,7 +98,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), if (quote) colnames(toprint) <- paste0('"', old <- colnames(toprint), '"') if (isTRUE(trunc.cols)) { # allow truncation of columns to print only what will fit in console PR #4074 - widths = dt_width(toprint, class, row.names, col.names) + widths = dt_width(toprint, n_x, class, row.names, col.names) cons_width = getOption("width") cols_to_print = widths < cons_width not_printed = colnames(toprint)[!cols_to_print] @@ -109,7 +114,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn)) rownames(toprint) = format(rownames(toprint), justify="right") if (col.names == "none") { - cut_top(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote)) } else { print(toprint, right=TRUE, quote=quote) } @@ -124,7 +129,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # option to shut this off per request of Oleg Bondar on SO, #1482 toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 if (col.names == "none") { - cut_top(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote)) } else { print(toprint, right=TRUE, quote=quote) } @@ -187,7 +192,8 @@ shouldPrint = function(x) { # for removing the head (column names) 
of matrix output entirely, # as opposed to printing a blank line, for excluding col.names per PR #1483 -cut_top = function(x) cat(capture.output(x)[-1L], sep = '\n') +# be sure to remove colnames from any row where they exist, #4270 +cut_colnames = function(x) writeLines(grep("^\\s*(?:[0-9]+:|---)", capture.output(x), value=TRUE)) # for printing the dims for list columns #3671; used by format.data.table() paste_dims = function(x) { @@ -202,12 +208,13 @@ paste_dims = function(x) { # to calculate widths of data.table for PR #4074 # gets the width of the data.table at each column # and compares it to the console width -dt_width = function(x, class, row.names, col.names) { +# pass nrow because x is the head/tail only so nrow(x) is wrong, #4266 +dt_width = function(x, nrow, class, row.names, col.names) { widths = apply(nchar(x, type='width'), 2L, max) if (class) widths = pmax(widths, 6L) - if (col.names != "none") names = sapply(colnames(x), nchar, type = "width") else names = 0L + if (col.names != "none") names = sapply(colnames(x), nchar, type="width") else names = 0L dt_widths = pmax(widths, names) - rownum_width = if (row.names) as.integer(ceiling(log10(nrow(x)))+2) else 0L + rownum_width = if (row.names) as.integer(ceiling(log10(nrow))+2) else 0L cumsum(dt_widths + 1L) + rownum_width } # keeps the dim and dimnames attributes diff --git a/R/setkey.R b/R/setkey.R index 1f3763b1f6..e9f18398ab 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -88,12 +88,12 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (verbose) { tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode - cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") + catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"]) } else { o = forderv(x, cols, sort=TRUE, retGrp=FALSE) } } else { - 
if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="") + if (verbose) catf("setkey on columns %s using existing index '%s'\n", brackify(cols), newkey) o = getindex(x, newkey) } if (!physical) { @@ -105,9 +105,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (length(o)) { if (verbose) { last.started.at = proc.time() } .Call(Creorder,x,o) - if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } + if (verbose) { catf("reorder took %s\n", timetaken(last.started.at)); flush.console() } } else { - if (verbose) cat("x is already ordered by these columns, no need to call reorder\n") + if (verbose) catf("x is already ordered by these columns, no need to call reorder\n") } # else empty integer() from forderv means x is already ordered by those cols, nothing to do. setattr(x,"sorted",cols) invisible(x) @@ -184,7 +184,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las forder = function(..., na.last=TRUE, decreasing=FALSE) { sub = substitute(list(...)) - tt = sapply(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) + tt = vapply_1b(sub, function(x) is.null(x) || (is.symbol(x) && !nzchar(x))) if (any(tt)) sub[tt] = NULL # remove any NULL or empty arguments; e.g. 
test 1962.052: forder(DT, NULL) and forder(DT, ) if (length(sub)<2L) return(NULL) # forder() with no arguments returns NULL consistent with base::order asc = rep.int(1L, length(sub)-1L) # ascending (1) or descending (-1) per column @@ -295,7 +295,7 @@ setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE) o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last) if (length(o)) { .Call(Creorder, x, o) - if (is.data.frame(x) & !is.data.table(x)) { + if (is.data.frame(x) && !is.data.table(x)) { setattr(x, 'row.names', rownames(x)[o]) } k = key(x) @@ -352,7 +352,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE) } } nrow = prod( vapply_1i(l, length) ) # lengths(l) will work from R 3.2.0 - if (nrow > .Machine$integer.max) stop(gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max, domain='R-data.table')) + if (nrow > .Machine$integer.max) stop(domain=NA, gettextf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max)) l = .Call(Ccj, l) setDT(l) l = setalloccol(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since diff --git a/R/setops.R b/R/setops.R index b6dcd7b0b2..d8fcb9dfcf 100644 --- a/R/setops.R +++ b/R/setops.R @@ -63,7 +63,7 @@ fintersect = function(x, y, all=FALSE) { x = shallow(x)[, ".seqn" := rowidv(x)] y = shallow(y)[, ".seqn" := rowidv(y)] jn.on = c(".seqn",setdiff(names(y),".seqn")) - # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) + # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) y[x, .SD, .SDcols=setdiff(names(y),".seqn"), nomatch=NULL, on=jn.on] } else { z = funique(x) # fixes #3034. When .. 
prefix in i= is implemented (TODO), this can be x[funique(..y), on=, multi=] @@ -154,17 +154,23 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu k1 = key(target) k2 = key(current) if (!identical(k1, k2)) { - return(sprintf("Datasets has different keys. 'target'%s. 'current'%s.", - if(length(k1)) paste0(": ", paste(k1, collapse=", ")) else " has no key", - if(length(k2)) paste0(": ", paste(k2, collapse=", ")) else " has no key")) + return(gettextf( + "Datasets have different %s. 'target': %s. 'current': %s.", + "keys", + if(length(k1)) brackify(k1) else gettextf("has no key"), + if(length(k2)) brackify(k2) else gettextf("has no key") + )) } # check index i1 = indices(target) i2 = indices(current) if (!identical(i1, i2)) { - return(sprintf("Datasets has different indexes. 'target'%s. 'current'%s.", - if(length(i1)) paste0(": ", paste(i1, collapse=", ")) else " has no index", - if(length(i2)) paste0(": ", paste(i2, collapse=", ")) else " has no index")) + return(gettextf( + "Datasets have different %s. 'target': %s. 
'current': %s.", + "indices", + if(length(i1)) brackify(i1) else gettextf("has no index"), + if(length(i2)) brackify(i2) else gettextf("has no index") + )) } # Trim any extra row.names attributes that came from some inheritance @@ -173,7 +179,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu a1 = exclude.attrs(attributes(target)) a2 = exclude.attrs(attributes(current)) if (length(a1) != length(a2)) return(sprintf("Datasets has different number of (non-excluded) attributes: target %s, current %s", length(a1), length(a2))) - if (!identical(nm1 <- sort(names(a1)), nm2 <- sort(names(a2)))) return(sprintf("Datasets has attributes with different names: %s", paste(setdiff(union(names(a1), names(a2)), intersect(names(a1), names(a2))), collapse=", "))) + if (!identical(nm1 <- sort(names(a1)), nm2 <- sort(names(a2)))) return(sprintf("Datasets has attributes with different names: %s", brackify(setdiff(union(names(a1), names(a2)), intersect(names(a1), names(a2)))))) attrs.r = all.equal(a1[nm1], a2[nm2], ..., check.attributes = check.attributes) if (is.character(attrs.r)) return(paste("Attributes: <", attrs.r, ">")) # skip further heavy processing } diff --git a/R/tables.R b/R/tables.R index bcfab0c674..b94441c626 100644 --- a/R/tables.R +++ b/R/tables.R @@ -8,7 +8,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80, all_obj = objects(envir=env, all.names=TRUE) is_DT = which(vapply_1b(all_obj, function(x) is.data.table(get(x, envir=env)))) if (!length(is_DT)) { - if (!silent) cat("No objects of class data.table exist in", if (identical(env,.GlobalEnv)) ".GlobalEnv" else format(env), "\n") + if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env)) return(invisible(data.table(NULL))) } DT_names = all_obj[is_DT] @@ -36,7 +36,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80, tt[ , NCOL := pretty_format(NCOL, width=4L)] if (mb) tt[ , MB := pretty_format(MB, 
width=2L)] print(tt, class=FALSE, nrows=Inf) - if (mb) cat("Total: ", prettyNum(sum(info$MB), big.mark=","), "MB\n", sep="") + if (mb) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=",")) } invisible(info) } diff --git a/R/test.data.table.R b/R/test.data.table.R index c5da3e0bac..cf778c68b6 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -46,7 +46,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov start fn2 = paste0(fn,".bz2") if (!file.exists(file.path(fulldir, fn2))) - stop(gettextf("Neither %s nor %s exist in %s",fn, fn2, fulldir, domain="R-data.table")) + stop(domain=NA, gettextf("Neither %s nor %s exist in %s",fn, fn2, fulldir)) fn = fn2 # nocov end # sys.source() below accepts .bz2 directly. @@ -92,17 +92,15 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F cat("getDTthreads(verbose=TRUE):\n") # for tracing on CRAN; output to log before anything is attempted getDTthreads(verbose=TRUE) # includes the returned value in the verbose output (rather than dangling '[1] 4'); e.g. "data.table is using 4 threads" - cat("test.data.table() running:", fn, "\n") # print fn to log before attempting anything on it (in case it is missing); on same line for slightly easier grep + catf("test.data.table() running: %s\n", fn) # print fn to log before attempting anything on it (in case it is missing); on same line for slightly easier grep env = new.env(parent=.GlobalEnv) assign("testDir", function(x) file.path(fulldir, x), envir=env) # are R's messages being translated to a foreign language? #3039, #630 - txt = eval(parse(text="tryCatch(mean(not__exist__), error = function(e) e$message)"), envir=.GlobalEnv) - foreign = txt != "object 'not__exist__' not found" + foreign = gettext("object '%s' not found", domain="R") != "object '%s' not found" if (foreign) { # nocov start - cat("\n**** This R session's language is not English. 
Each test will still check that the correct number of errors and/or\n", - "**** warnings are produced. However, to test the text of each error/warning too, please restart R with LANGUAGE=en\n\n", sep="") + catf("\n**** This R session's language is not English. Each test will still check that the correct number of errors and/or\n**** warnings are produced. However, to test the text of each error/warning too, please restart R with LANGUAGE=en\n\n") # nocov end } assign("foreign", foreign, envir=env) @@ -162,8 +160,14 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F ntest = env$ntest if (nfail > 0L) { # nocov start - if (nfail > 1L) {s1="s";s2="s: "} else {s1="";s2=" "} - stop(nfail," error",s1," out of ",ntest,". Search ",names(fn)," for test number",s2,paste(env$whichfail,collapse=", "),".") + # domain=NA since it's already translated by then + stop(domain = NA, sprintf( + ngettext( + nfail, + "%d error out of %d. Search %s for test number %s", + "%d errors out of %d. 
Search %s for test numbers %s" + ), nfail, ntest, names(fn), paste(env$whichfail, collapse=", ") + )) # important to stop() here, so that 'R CMD check' fails # nocov end } @@ -172,12 +176,12 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F timings = env$timings DT = head(timings[-1L][order(-time)], 10L) # exclude id 1 as in dev that includes JIT if ((x<-sum(timings[["nTest"]])) != ntest) { - warning("Timings count mismatch:",x,"vs",ntest) # nocov + warning("Timings count mismatch: ",x," vs ",ntest) # nocov } - cat("10 longest running tests took ", as.integer(tt<-DT[, sum(time)]), "s (", as.integer(100*tt/(ss<-timings[,sum(time)])), "% of ", as.integer(ss), "s)\n", sep="") + catf("10 longest running tests took %ds (%d%% of %ds)\n", as.integer(tt<-DT[, sum(time)]), as.integer(100*tt/(ss<-timings[,sum(time)])), as.integer(ss)) print(DT, class=FALSE) - cat("All ",ntest," tests (last ",env$prevtest,") in ",names(fn)," completed ok in ",timetaken(env$started.at),"\n",sep="") + catf("All %d tests (last %s) in %s completed ok in %s\n", ntest, env$prevtest, names(fn), timetaken(env$started.at)) ## this chunk requires to include new suggested deps: graphics, grDevices #memtest.plot = function(.inittime) { @@ -211,10 +215,10 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F compactprint = function(DT, topn=2L) { tt = vapply_1c(DT,function(x)class(x)[1L]) tt[tt=="integer64"] = "i64" - tt = substring(tt, 1L, 3L) + tt = substr(tt, 1L, 3L) makeString = function(x) paste(x, collapse = ",") # essentially toString.default cn = paste0(" [Key=",makeString(key(DT)), - " Types=", makeString(substring(sapply(DT, typeof), 1L, 3L)), + " Types=", makeString(substr(sapply(DT, typeof), 1L, 3L)), " Classes=", makeString(tt), "]") if (nrow(DT)) { print(copy(DT)[,(cn):="",verbose=FALSE], topn=topn, class=FALSE) @@ -255,6 +259,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # iv) if warning 
is supplied, y is checked to equal x, and x should result in a warning message matching the pattern # v) if output is supplied, x is evaluated and printed and the output is checked to match the pattern # num just needs to be numeric and unique. We normally increment integers at the end, but inserts can be made using decimals e.g. 10,11,11.1,11.2,12,13,... + # num=0 to escape global failure tracking so we can test behaviour of test function itself: test(1.1, test(0, TRUE, FALSE), FALSE, output="1 element mismatch") # Motivations: # 1) we'd like to know all tests that fail not just stop at the first. This often helps by revealing a common feature across a set of # failing tests @@ -268,7 +273,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no prevtest = get("prevtest", parent.frame()) nfail = get("nfail", parent.frame()) # to cater for both test.data.table() and stepping through tests in dev whichfail = get("whichfail", parent.frame()) - assign("ntest", get("ntest", parent.frame()) + 1L, parent.frame(), inherits=TRUE) # bump number of tests run + assign("ntest", get("ntest", parent.frame()) + if (num>0) 1L else 0L, parent.frame(), inherits=TRUE) # bump number of tests run lasttime = get("lasttime", parent.frame()) timings = get("timings", parent.frame()) memtest = get("memtest", parent.frame()) @@ -277,14 +282,15 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) time = nTest = NULL # to avoid 'no visible binding' note - on.exit( { + if (num>0) on.exit( { now = proc.time()[3L] took = now-lasttime # so that prep time between tests is attributed to the following test assign("lasttime", now, parent.frame(), inherits=TRUE) timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ] } ) if (showProgress) - cat("\rRunning test id", numStr, " ") # nocov. 
+ # \r can't be in gettextf msg + cat("\rRunning test id", numStr, " ") # nocov. # See PR #4090 for comments about change here in Dec 2019. # If a segfault error occurs in future and we'd like to know after which test, then arrange for the # try(sys.source()) in test.data.table() to be run in a separate R process. That process could write out @@ -338,10 +344,10 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no fwrite(mem, "memtest.csv", append=TRUE, verbose=FALSE) # nocov } fail = FALSE - if (.test.data.table) { + if (.test.data.table && num>0) { if (num>\n",sep="") # \n printed as '\\n' so the two lines of output can be compared vertically - cat("Observed: <<",gsub("\n","\\\\n",out),">>\n",sep="") + catf("Test %s did not produce correct output:\n", numStr) + catf("Expected: <<%s>>\n", encodeString(output)) # \n printed as '\\n' so the two lines of output can be compared vertically + catf("Observed: <<%s>>\n", encodeString(out)) fail = TRUE # nocov end } if (length(notOutput) && string_match(notOutput, out, ignore.case=TRUE)) { # nocov start - cat("Test",numStr,"produced output but should not have:\n") - cat("Expected absent (case insensitive): <<",gsub("\n","\\\\n",notOutput),">>\n",sep="") - cat("Observed: <<",gsub("\n","\\\\n",out),">>\n",sep="") + catf("Test %s produced output but should not have:\n", numStr) + catf("Expected absent (case insensitive): <<%s>>\n", encodeString(notOutput)) + catf("Observed: <<%s>>\n", encodeString(out)) fail = TRUE # nocov end } @@ -411,7 +413,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no if (is.data.table(x) && is.data.table(y)) { if (!selfrefok(x) || !selfrefok(y)) { # nocov start - cat("Test ",numStr," ran without errors but selfrefok(", if(!selfrefok(x))"x"else"y", ") is FALSE\n", sep="") + catf("Test %s ran without errors but selfrefok(%s) is FALSE\n", numStr, if (selfrefok(x)) "y" else "x") fail = TRUE # nocov end } else { @@ -434,12 +436,12 @@ test = 
function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # For test 617 on r-prerel-solaris-sparc on 7 Mar 2013 # nocov start if (!fail) { - cat("Test", numStr, "ran without errors but failed check that x equals y:\n") + catf("Test %s ran without errors but failed check that x equals y:\n", numStr) failPrint = function(x, xsub) { cat(">", substitute(x), "=", xsub, "\n") if (is.data.table(x)) compactprint(x) else { nn = length(x) - cat(sprintf("First %d of %d (type '%s'): \n", min(nn, 6L), length(x), typeof(x))) + catf("First %d of %d (type '%s'): \n", min(nn, 6L), length(x), typeof(x)) # head.matrix doesn't restrict columns if (length(d <- dim(x))) do.call(`[`, c(list(x, drop = FALSE), lapply(pmin(d, 6L), seq_len))) else print(head(x)) @@ -452,7 +454,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } # nocov end } - if (fail && .test.data.table) { + if (fail && .test.data.table && num>0) { # nocov start assign("nfail", nfail+1L, parent.frame(), inherits=TRUE) assign("whichfail", c(whichfail, numStr), parent.frame(), inherits=TRUE) diff --git a/R/utils.R b/R/utils.R index 42e67ea8de..7a698131c6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -25,6 +25,13 @@ if (base::getRversion() < "3.2.0") { # Apr 2015 isNamespaceLoaded = function(x) x %chin% loadedNamespaces() } +if (!exists('startsWith', 'package:base', inherits=FALSE)) { # R 3.3.0; Apr 2016 + startsWith = function(x, stub) substr(x, 1L, nchar(stub))==stub +} +if (!exists('endsWith', 'package:base', inherits=FALSE)) { + endsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} +} + # which.first which.first = function(x) { @@ -45,7 +52,7 @@ which.last = function(x) require_bit64_if_needed = function(DT) { # called in fread and print.data.table - if (!isNamespaceLoaded("bit64") && any(sapply(DT,inherits,"integer64"))) { + if (!isNamespaceLoaded("bit64") && any(vapply_1b(DT, inherits, "integer64"))) { # nocov start # a test was attempted to 
cover the requireNamespace() by using unloadNamespace() first, but that fails when nanotime is loaded because nanotime also uses bit64 if (!requireNamespace("bit64",quietly=TRUE)) { @@ -84,7 +91,7 @@ name_dots = function(...) { } notnamed = vnames=="" if (any(notnamed)) { - syms = sapply(dot_sub, is.symbol) # save the deparse() in most cases of plain symbol + syms = vapply_1b(dot_sub, is.symbol) # save the deparse() in most cases of plain symbol for (i in which(notnamed)) { tmp = if (syms[i]) as.character(dot_sub[[i]]) else deparse(dot_sub[[i]])[1L] if (tmp == make.names(tmp)) vnames[i]=tmp @@ -101,27 +108,32 @@ brackify = function(x, quote=FALSE) { # keep one more than needed to trigger dots if needed if (quote && is.character(x)) x = paste0("'",head(x,CUTOFF+1L),"'") if (length(x) > CUTOFF) x = c(x[1:CUTOFF], '...') - sprintf('[%s]', paste(x, collapse = ', ')) + sprintf('[%s]', toString(x)) } # patterns done via NSE in melt.data.table and .SDcols in `[.data.table` -do_patterns = function(pat_sub, all_cols) { - # received as substitute(patterns(...)) - pat_sub = as.list(pat_sub)[-1L] - # identify cols = argument if present - idx = which(names(pat_sub) == "cols") - if (length(idx)) { - cols = eval(pat_sub[["cols"]], parent.frame(2L)) - pat_sub = pat_sub[-idx] - } else cols = all_cols - pats = lapply(pat_sub, eval, parent.frame(2L)) - matched = patterns(pats, cols=cols) - # replace with lengths when R 3.2.0 dependency arrives - if (length(idx <- which(sapply(matched, length) == 0L))) - stop('Pattern', if (length(idx) > 1L) 's', ' not found: [', - paste(pats[idx], collapse = ', '), ']') - - return(matched) +# was called do_patterns() before PR#4731 +eval_with_cols = function(orig_call, all_cols) { + parent = parent.frame(2L) + fun_uneval = orig_call[[1L]] + # take fun from either calling env (parent) or from data.table + fun = tryCatch({ + maybe_fun = eval(fun_uneval, parent) + # parent env could have a non-function with this name, which we + # should ignore. 
+ stopifnot(is.function(maybe_fun)) + maybe_fun + }, error=function(e) { + eval(fun_uneval)#take function from data.table namespace. + }) + if (!is.primitive(fun)) { + named_call = match.call(fun, orig_call) + if ("cols" %in% names(formals(fun)) && !"cols" %in% names(named_call)) { + named_call[["cols"]] = all_cols + } + named_call[[1L]] = fun + eval(named_call, parent) + } } # check UTC status @@ -140,3 +152,8 @@ edit.data.table = function(name, ...) { setDT(NextMethod('edit', name))[] } # nocov end + +catf = function(fmt, ...) { + cat(gettextf(fmt, ...)) +} + diff --git a/R/xts.R b/R/xts.R index bfb6f813a7..fce6aad3b5 100644 --- a/R/xts.R +++ b/R/xts.R @@ -7,8 +7,8 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { r = setDT(as.data.frame(x, row.names=NULL)) if (identical(keep.rownames, FALSE)) return(r[]) index_nm = if (is.character(keep.rownames)) keep.rownames else "index" - if (index_nm %chin% names(x)) stop(gettextf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm, domain="R-data.table"), domain=NA) - r[, c(index_nm) := zoo::index(x)] + if (index_nm %chin% names(x)) stop(domain=NA, gettextf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm)) + r[, c(index_nm) := zoo::index(x), env=list(x=x)] setcolorder(r, c(index_nm, setdiff(names(r), index_nm))) # save to end to allow for key=index_nm setkeyv(r, key) @@ -19,7 +19,7 @@ as.xts.data.table = function(x, ...) 
{ stopifnot(requireNamespace("xts"), !missing(x), is.data.table(x)) if (!xts::is.timeBased(x[[1L]])) stop("data.table must have a time based column in first position, use `setcolorder` function to change the order, or see ?timeBased for supported types") colsNumeric = vapply_1b(x, is.numeric)[-1L] # exclude first col, xts index - if (any(!colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric])) + if (!all(colsNumeric)) warning("Following columns are not numeric and will be omitted: ", brackify(names(colsNumeric)[!colsNumeric])) r = setDF(x[, .SD, .SDcols = names(colsNumeric)[colsNumeric]]) return(xts::as.xts(r, order.by = if ("IDate" %chin% class(x[[1L]])) as.Date(x[[1L]]) else x[[1L]])) } diff --git a/_pkgdown.yml b/_pkgdown.yml index 6d2ef397d3..4b02b39491 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -36,8 +36,10 @@ navbar: href: articles/datatable-keys-fast-subset.html - text: "Secondary indices and auto indexing" href: articles/datatable-secondary-indices-and-auto-indexing.html - - text: "Efficient reshaping using data.tables" + - text: "Efficient reshaping using data.table" href: articles/datatable-reshape.html + - text: "Programming on data.table" + href: articles/datatable-programming.html - text: "Frequently asked questions" href: articles/datatable-faq.html - text: "Importing data.table" diff --git a/inst/include/datatableAPI.h b/inst/include/datatableAPI.h index 44f52018f4..e2a1b2fd32 100644 --- a/inst/include/datatableAPI.h +++ b/inst/include/datatableAPI.h @@ -21,11 +21,14 @@ extern "C" { /* provided the interface for the function exported in ../src/init.c via R_RegisterCCallable() */ +// subsetDT #3751 inline SEXP attribute_hidden DT_subsetDT(SEXP x, SEXP rows, SEXP cols) { static SEXP(*fun)(SEXP, SEXP, SEXP) = - (SEXP(*)(SEXP,SEXP,SEXP)) R_GetCCallable("data.table", "CsubsetDT"); + (SEXP(*)(SEXP,SEXP,SEXP)) R_GetCCallable("data.table", "DT_subsetDT"); return fun(x,rows,cols); } +// 
forder #4015 +// setalloccol alloccolwrapper setDT #4439 /* permit opt-in to redefine shorter identifiers */ #if defined(DATATABLE_REMAP_API) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 1c8bf146a6..bf0bf77e9f 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -161,10 +161,10 @@ set.seed(1) L = lapply(1:1e6, sample, x=100, size=2) x = capture.output(fwrite(L)) test(1742.1, nchar(x), c(2919861L, 2919774L)) # tests 2 very long lines, too -test(1742.2, substring(x,1,10), c("27,58,21,9","38,91,90,6")) +test(1742.2, substr(x, 1L, 10L), c("27,58,21,9", "38,91,90,6")) test(1742.3, L[[1L]], c(27L,38L)) test(1742.4, L[[1000000L]], c(76L, 40L)) -test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40")) +test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40")) # Add scaled-up non-ASCII forder test 1896 diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 1bd91286f9..03d62b4389 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -186,8 +186,6 @@ if (loaded[["parallel"]]) { } # example(":=", local=TRUE) triggered cedta==FALSE and then error, #2972 -res = tryCatch(example(':=', package='data.table', local=TRUE)) -test(14.1, !inherits(res, 'error')) -res = tryCatch(example('CJ', package='data.table', local=TRUE)) -test(14.2, !inherits(res, 'error')) +test(14.1, {example(':=', package='data.table', local=TRUE); TRUE}) +test(14.2, {example('CJ', package='data.table', local=TRUE); TRUE}) diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw new file mode 100644 index 0000000000..88c6a99e6f --- /dev/null +++ b/inst/tests/programming.Rraw @@ -0,0 +1,600 @@ +require(methods) +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = 
data.table:::test + is.AsIs = data.table:::is.AsIs + rm.AsIs = data.table:::rm.AsIs + enlist = data.table:::enlist + list2lang = data.table:::list2lang +} + +# test that 'test' catches the difference in language object +cl1 = substitute(f(1L, list(2L))) +cl2 = substitute(f(1L, .v), list(.v=list(2L))) +test(1.01, all.equal(cl1, cl2), TRUE) +test(1.02, identical(cl1, cl2), FALSE) +test(1.03, test(0, cl1, cl2), FALSE, output="f(1L, list(2L))") +# AsIs +test(1.11, is.AsIs(1L), FALSE) +test(1.12, is.AsIs(I(1L)), TRUE) +test(1.13, is.AsIs("a"), FALSE) +test(1.14, is.AsIs(I("a")), TRUE) +test(1.15, is.AsIs(list(1L)), FALSE) +test(1.16, is.AsIs(I(list(1L))), TRUE) +test(1.17, is.AsIs(structure(list(NULL), class="an_S3")), FALSE) ## S3 +test(1.18, is.AsIs(I(structure(list(NULL), class="an_S3"))), TRUE) +test(1.19, is.AsIs(getClass("MethodDefinition")), FALSE) ## S4 +test(1.20, is.AsIs(suppressWarnings(I(getClass("MethodDefinition")))), TRUE) ## suppressWarnings due new warning in R 4.1 +test(1.21, is.AsIs(rm.AsIs(1L)), FALSE) +test(1.22, is.AsIs(rm.AsIs(I(1L))), FALSE) +test(1.23, is.AsIs(rm.AsIs(list(1L))), FALSE) +test(1.24, is.AsIs(rm.AsIs(I(list(1L)))), FALSE) + +# substitute2 simple +test(2.01, substitute2(list(var = val), env = list(var="my_var", val=5L)), quote(list(my_var = 5L))) +# substitute2 + I to handle char and symbol +test(2.02, substitute2(list(var = val), env = list(var="my_var", val=I("my_val"))), quote(list(my_var="my_val"))) +test(2.03, substitute2(list(var = val), env = I(list(var=as.name("my_var"), val="my_val"))), quote(list(my_var="my_val"))) +# substitute2 handle symbol anyway +test(2.04, substitute2(list(var = val), env = list(var=as.name("my_var"), val=I("my_val"))), quote(list(my_var="my_val"))) +# substitute2 complex use case +test(2.11, substitute2( + .(fun_ans_var = fun(farg1, farg2=farg2val), timestamp=Sys.time(), col_head = head(head_arg, n=1L)), + list( + fun_ans_var = "my_mean_res", + fun = "mean", + farg1 = "my_x_col", + farg2 = "na.rm", 
+ farg2val = TRUE, + col_head = "first_y", + head_arg = "y" + ) +), quote(.(my_mean_res=mean(my_x_col, na.rm=TRUE), timestamp=Sys.time(), first_y=head(y, n=1L)))) +# substitute2 PR example +test(2.12, substitute2( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ) +), quote(.(sum_x = sum(x, na.rm=TRUE)))) +# substitute2 nested calls argument names substitute +test(2.13, substitute2( + f1(a1 = f2(a2 = f3(a3 = f4(a4 = v1, extra=v2), v3, a3b = v4)), a1b=c("a","b")), + list(f1="fun1", f2="fun2", f3="fun3", f4="fun4", a1="arg1", a2="arg2", a3="arg3", a4="arg4", v1="col1", extra="n", v2=6L, v3="col2", a3b="arg3b", v4=c(3.5,4.5), a1b="arg1b") +), substitute( + fun1(arg1 = fun2(arg2 = fun3(arg3 = fun4(arg4 = col1, n=6L), col2, arg3b = v4)), arg1b=c("a","b")), + list(v4=c(3.5,4.5)) +)) +# calls of length 0 args +const1 = function() 1L +test(2.21, substitute2(list(nm = fun()), env=list(a="b", fun="const1", nm="int1")), quote(list(int1=const1()))) +test(2.22, substitute2(.(), env=list(a="b", fun="const1", nm="int1")), quote(.())) +test(2.23, identical(substitute2(), substitute())) +# substitute2 AsIs class properly removed or kept +test(2.31, class(substitute2(var3%in%values, list(var3="a", values=I(c("a","b","c"))))[[3L]]), "character") +test(2.32, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c("a","b","c"))))[[3L]]), "character") +test(2.33, class(substitute2(var3%in%values, list(var3="a", values=I(1:3)))[[3L]]), "integer") +test(2.34, class(substitute2(var3%in%values, I(list(var3=as.name("a"), values=c(1:3))))[[3L]]), "integer") +cl = substitute2(var3%in%values, I(list(var3=as.name("a"), values=I(c("a","b","c"))))) ## keeping AsIs by extra I on whole env arg +test(2.35, cl, substitute(a %in% .v, list(.v=I(c("a","b","c"))))) +test(2.36, class(cl[[3L]]), "AsIs") +cl = substitute2(var3%in%values, I(list(var3="a", 
values=I(1:3)))) +test(2.37, cl, substitute("a" %in% .v, list(.v=I(1:3)))) +test(2.38, class(cl[[3L]]), "AsIs") +# substitute2 non-scalar char as name +test(2.41, substitute2(list(var = val), env = list(var="my_var", val=c("a","b"))), error="are not scalar") +test(2.42, substitute2(list(var = val), env = list(var="my_var", val=I(c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) ## note that quote(list(my_var=c("a","b")))) will not work because 'c("a","b")' will be a 'language' class (a 'c()' call), but we need to have it as 'character' class instead +test(2.43, substitute2(list(var = val), env = I(list(var=as.name("my_var"), val=c("a","b")))), substitute(list(my_var=.v), list(.v=c("a","b")))) +# substitute2 non-symbol +test(2.44, substitute2(list(var = val), env = list(var=I("my_var"), val="my_val")), error="type 'character' but it has to be 'symbol'") +test(2.45, substitute2(list(var = val), env = I(list(var="my_var", val="my_val"))), error="type 'character' but it has to be 'symbol'") +test(2.46, substitute2(.(v1=v2), list(v1=1L, v2=2L)), error="type 'integer' but it has to be 'symbol'") +test(2.47, substitute2(.(v1=v2), list(v1=FALSE, v2=2L)), error="type 'logical' but it has to be 'symbol'") +# substitute2 NA_character_ becomes valid 'NA' name +test(2.48, substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = NA_character_, "." = "list")), quote(list(`NA` = `NA`))) +cl = substitute2(.(v1 = v2), list(v1 = NA_character_, v2 = I(NA_character_), "." 
= "list")) +test(2.49, cl, quote(list(`NA` = NA_character_))) +test(2.50, eval(cl), list("NA" = NA_character_)) +# substitute2 duplicate matches +test(2.51, substitute2(list(v1=v2, v1=v2), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 2L))) +test(2.52, substitute2(list(v1=v2, v1=v3), env=list(v1="nm",v2=2L,v3=3L)), quote(list(nm = 2L, nm = 3L))) +# substitute2 nested unnamed call +test(2.53, substitute2(c(list(v1=v2, v1=v2)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 2L)))) +test(2.54, substitute2(c(list(v1=v2, v1=v3)), env=list(v1="nm",v2=2L,v3=3L)), quote(c(list(nm = 2L, nm = 3L)))) + +# substitute2 env as environment class +e = as.environment(list(v=1L, .v=2L)) +test(2.81, substitute2(.(v, .v), e), quote(.(1L, 2L))) +# unline in base R substitute, the env arg is always evaluated +e = new.env() +delayedAssign("a_promise", stop("I am the error"), assign.env=e) +e$x = 5L +promises = function(env) { + f = function(x, env) eval(substitute(substitute(.x, env), list(.x=x))) + sym = lapply(setNames(nm=ls(env)), as.name) + lapply(sym, f, env) +} +test(2.820, promises(e), list(a_promise=quote(stop("I am the error")), x=5L)) +test(2.821, substitute(x + 1L, e), quote(5L + 1L)) +test(2.822, substitute2(x + 1L, e), error="I am the error", ignore.warning="restarting interrupted promise evaluation") +# substitute2 env various corner cases +test(2.901, substitute2(.(v), NULL), quote(.(v))) +test(2.902, substitute2(.(v), list()), quote(.(v))) +test(2.903, substitute2(.(v), emptyenv()), quote(.(v))) +test(2.91, substitute2(.()), error="'env' must not be missing") +test(2.92, substitute2(v, c(v=1L)), error="'env' must be a list or an environment") +test(2.93, substitute2(.(v), list(1L, 2L)), error="'env' argument does not have names") +test(2.94, substitute2(.(v), structure(list(1L,2L), names=c("","v"))), error="'env' argument has zero char names") +test(2.95, substitute2(.(v), structure(list(1,2), names=c(NA,"v"))), error="'env' argument has NA names") 
+test(2.96, substitute2(.(v), list(v=1,v=2)), error="'env' argument has duplicated names") + +# substitute2 re-use inside another function +f = function(expr, env) { + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +cl = f( + .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = TRUE, + out_col_name = "sum_x" + ) +) +test(3.01, cl, quote(.(sum_x = sum(x, na.rm = TRUE)))) +# substitute2 nested re-use inside another function +cl = substitute2(list(nm = fun(.(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)), + env = list( + in_col_name = "x", + fun = "sum", + fun_arg1 = "na.rm", + fun_arg1val = tf_var, ## note a parameter here + out_col_name = "sum_x" +))), list(nm="my_call", fun="f", tf_var=FALSE)) +test(3.02, eval(cl), list(my_call = quote(.(sum_x = sum(x, na.rm = FALSE))))) + +# enlist +test(4.01, enlist(c("a")), error="'x' must be a list") +test(4.02, enlist(list("V1","V2")), quote(list(V1, V2))) +test(4.03, enlist(list(V1="V1", V2="V2")), quote(list(V1=V1, V2=V2))) +test(4.04, enlist(I(list(V1="V1", V2="V2"))), list(V1="V1", V2="V2")) +test(4.05, enlist(list(V1=I("V1"), V2=I("V2"))), quote(list(V1="V1", V2="V2"))) +test(4.06, enlist(list(V1="V1", V2=I("V2"))), quote(list(V1=V1, V2="V2"))) +test(4.07, enlist(list(V1="V1", V2=I("V2"), V3=list("X1", "X2"))), quote(list(V1=V1, V2="V2", V3=list(X1, X2)))) +test(4.08, enlist(list(V1="V1", V2=I("V2"), V3=list(X1="X1", X2=I("X2")))), quote(list(V1=V1, V2="V2", V3=list(X1=X1, X2="X2")))) +test(4.09, enlist(list(V1="V1", V2=I("V2"), V3=enlist(list("X1","X2")))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.10, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(list("X1","X2"))))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.11, enlist(list(V1="V1", V2=I("V2"), V3=enlist(I(list("X1","X2"))))), quote(list(V1 = V1, V2 = "V2", V3 = list(X1, X2)))) +test(4.12, 
enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(I(list("X1","X2")))))), substitute(list(V1 = V1, V2 = "V2", V3 = lst), list(lst = list("X1", "X2")))) +test(4.13, enlist(list(V1="V1", V2=I("V2"), V3=I(enlist(list(I("X1"),I("X2")))))), quote(list(V1 = V1, V2 = "V2", V3 = list("X1", "X2")))) +test(4.14, enlist(I(list(V1="V1", V2=list("V2")))), list(V1="V1", V2=list("V2"))) +test(4.15, enlist(I(list(V1="V1", V2=I(list("V2"))))), list(V1="V1", V2=I(list("V2")))) + +# list2lang +test(5.01, list2lang(c("a")), error="'x' must be a list") +test(5.02, list2lang(list("a", 1L)), list(as.name("a"), 1L)) +test(5.03, list2lang(I(list("a", 1L))), list("a", 1L)) +test(5.04, list2lang(list(I("a"), 1L)), list("a", 1L)) +test(5.05, list2lang(list("a", 1L, list("b"))), list(as.name("a"), 1L, call("list", as.name("b")))) +test(5.06, list2lang(list("a", 1L, list(I("b")))), list(as.name("a"), 1L, call("list", "b"))) +test(5.07, list2lang(list("a", 1L, I(list("b")))), list(as.name("a"), 1L, list("b"))) +test(5.08, list2lang(I(list("a", 1L, list("b")))), list("a", 1L, list("b"))) +test(5.09, list2lang(I(list("a", 1L, I(list("b"))))), list("a", 1L, I(list("b")))) +test(5.10, list2lang(list("a", 1L, c(1L, 2L))), list(as.name("a"), 1L, c(1L,2L))) ## no 'enlist' like feature for 'c()' function, see next test +test(5.11, list2lang(list("a", 1L, call("c", 1L, 2L))), list(as.name("a"), 1L, quote(c(1L, 2L)))) + +# datatable.enlist +op = options(datatable.enlist=NULL) +test(6.01, + substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), + quote(list(int = 1L, lst = list(a, b, list(c, d))))) +options(datatable.enlist=FALSE) +test(6.02, + substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), + substitute(list(int = 1L, lst = lst), list(lst = list("a", "b", list("c", "d"))))) +options(datatable.enlist=NULL) +test(6.03, + enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), + 
quote(list(v1 = 1L, v2 = list(v3 = b, v4 = list(v5 = c))))) +options(datatable.enlist=FALSE) +test(6.04, + enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), + substitute(list(v1 = 1L, v2 = lst), list(lst=list(v3 = "b", v4 = list(v5 = "c"))))) +options(datatable.enlist=NULL) +test(6.05, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), + quote(list(V1, V2, list(V4, V5)))) +options(datatable.enlist=FALSE) +test(6.06, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), + quote(list(V1, V2, list(V4, V5)))) +test(6.07, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", list("V6"))))), + substitute(list(V1, V2, list(V4, V5, lst)), list(lst=list("V6")))) +test(6.08, + substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", enlist(list("V6")))))), + quote(list(V1, V2, list(V4, V5, list(V6))))) +options(op) + +# documentation examples +test(7.01, substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5L))) ## works also on names +test(7.02, substitute2(var1, list(var1 = I("c1"))), "c1") ## enforce character with I +test(7.03, substitute2(var1, list(var1 = "c1")), quote(c1)) ## turn character into symbol, for convenience +test(7.04, substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))), quote(list(c1 = "some_character"))) ## mix symbols and characters +test(7.05, substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))), quote(list(c1 = "some_character"))) +test(7.06, substitute2(f(lst), I(list(lst = list(1L, 2L)))), substitute(f(lst), list(lst=list(1L,2L)))) ## list elements are enlist'ed into list calls +test(7.07, substitute2(f(lst), list(lst = I(list(1L, 2L)))), substitute(f(lst), list(lst=list(1L,2L)))) +test(7.08, substitute2(f(lst), list(lst = call("list", 1L, 2L))), quote(f(list(1L, 2L)))) +test(7.09, substitute2(f(lst), list(lst = list(1L, 2L))), 
quote(f(list(1L, 2L)))) +test(7.10, substitute2(f(lst), list(lst = list(1L, list(2L)))), quote(f(list(1L, list(2L))))) ## character to name and list into list calls works recursively +test(7.11, substitute2(f(lst), I(list(lst = list(1L, list(2L))))), substitute(f(lst), list(lst=list(1L, list(2L))))) +f = function(expr, env) { ## using substitute2 from another function + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +test(7.12, f(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5L))) + +# data.table i, j, by +d = data.table(a = 2:1, b = 1:4) +test(11.01, d[var3%in%values, .(var1 = f(var2)), by=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=0:3), + verbose=TRUE], data.table(a=c(2L,1L), res=c(4L,6L)), output=c("Argument 'by' after substitute: a","Argument 'j' after substitute: .(res = sum(b))","Argument 'i' after substitute: a %in% 0:3")) +# data.table symbols and chars +d = data.table(a = c("b","a"), b = 1:4) +out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, + env=list(var1="res", var2="b", f="sum", var3="a", values=I(c("a","b","c"))), + verbose=TRUE]) # could not use output arg in test, so test it manually +test(11.02, ans, data.table(a=c("a","b"), res=c(6L,4L), key="a")) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.021, length(out), 3L) # we expect i, j, by only here, ensure about that +test(11.022, "Argument 'by' after substitute: a" %in% out, TRUE) +test(11.023, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.024, "Argument 'i' after substitute: a %in% c(\"a\", \"b\", \"c\")" %in% out, TRUE) +out = capture.output(ans <- d[var3%in%values, .(var1 = f(var2)), keyby=var3, + env=I(list(var1=as.name("res"), var2=as.name("b"), f=as.name("sum"), var3=as.name("a"), values=c("b","c"))), + verbose=TRUE]) +test(11.03, ans, data.table(a=c("b"), res=c(4L), key="a")) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.031, 
length(out), 3L) +test(11.032, "Argument 'by' after substitute: a" %in% out, TRUE) +test(11.033, "Argument 'j' after substitute: .(res = sum(b))" %in% out, TRUE) +test(11.034, "Argument 'i' after substitute: a %in% c(\"b\", \"c\")" %in% out, TRUE) +# substitute2 during join +d1 = data.table(id1=1:4, v1=5) +d2 = data.table(id1=c(0L,2:3), v1=6) +out = capture.output(ans <- d1[d2, on="id1<=id1", .(c1, c2, c3, c4), env=list(c1="x.id1", c2="i.id1", c3="x.v1", c4="i.v1"), verbose=TRUE]) +test(11.041, ans, data.table(x.id1=c(NA,1:2,1:3), i.id1=c(0L,2L,2L,3L,3L,3L), x.v1=c(NA,rep(5,5)), i.v1=rep(6,6))) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.042, length(out), 2L) ## 2L because i is non-missing attempt to substitute is made +test(11.043, "Argument 'j' after substitute: .(x.id1, i.id1, x.v1, i.v1)" %in% out, TRUE) +d1 = data.table(id1=c(2L,4L,2L,4L), v1=5) +d2 = data.table(id1=c(0L,2:3), v1=6) +out = capture.output(ans <- d1[dd, on="id1<=id1", .(sum(c3), sum(c4)), by=by, env=list(dd="d2", c3="x.v1", c4="i.v1", by=".EACHI"), verbose=TRUE]) +test(11.044, ans, data.table(id1=c(0L,2L,3L), V1=c(NA,10,10), V2=c(6,6,6))) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.045, length(out), 3L) +test(11.046, "Argument 'by' after substitute: .EACHI" %in% out, TRUE) +test(11.047, "Argument 'j' after substitute: .(sum(x.v1), sum(i.v1))" %in% out, TRUE) +test(11.048, "Argument 'i' after substitute: d2" %in% out, TRUE) +dt1 = data.table(x = letters[1:5], y = 1:5) +dt2 = data.table(x = letters[1:3], y = 11:13) +target_v = "y" +source_v = paste0("i.", target_v) +on_v = "x" +out = capture.output(invisible(dt1[dt2, target_v := source_v, on = on_v, env = list(target_v = target_v, source_v = source_v), verbose=TRUE])) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.049, length(out), 2L) +test(11.050, dt1, data.table(x = c("a", "b", "c", "d", "e"), y = c(11L, 12L, 13L, 4L, 5L))) +# substitute special symbols +d = data.table(V1=1:2, V2=1:4) 
+test(11.051, d[, j, by, env=list(j=".N", by="V1")], data.table(V1=c(1L,2L), N=c(2L,2L))) +test(11.052, d[, j, by, env=list(j=".SD", by="V1")], data.table(V1=c(1L,1L,2L,2L), V2=c(1L,3L,2L,4L))) +test(11.053, d[, j, env=I(list(j=as.name(".N")))], 4L) +test(11.054, d[, .(op, fun(col)), by=by, env=list(op=".N", fun="sum", col="V2", by="V1")], data.table(V1=1:2, N=c(2L,2L), V2=c(4L,6L))) +# get and mget use cases +d = as.data.table(lapply(1:5, rep, 2L)) +setnames(d, paste0("c",1:5)) +v1 = "c1"; v2 = "c2"; v3 = "c3"; v4 = "c4"; v5 = "c5" +test(11.061, d[, v1, env=list(v1=v1)], d[, get(v1)]) ## symbol c1 +test(11.062, d[, v1, env=list(v1=I(v1))], data.table(c1=c(1L,1L))) ## character "c1" +test(11.063, d[, list(v1), env=list(v1=v1)], d[, mget(v1)]) ## symbol c1 in list +test(11.064, d[, v1v2, env=list(v1v2=I(c(v1,v2)))], d[, mget(c(v1, v2))]) ## character c("c1","c2") +test(11.065, d[, v1v2, env=list(v1v2=as.list(c(v1,v2)))], d[, mget(c(v1, v2))]) ## call list(c1,c2) ## auto-enlist +test(11.066, d[, .(v1), env=list(v1=v1)], data.table(c1=c(1L,1L))) ## d[, .(get(v1))] - (m)get would return unnamed columns +test(11.067, d[, .(v1, v2), env=list(v1=v1, v2=v2)], data.table(c1=c(1L,1L),c2=c(2L,2L))) ## d[, .(get(v1), get(v2))] +test(11.068, d[, .(sum(v1)), env=list(v1=v1)], d[, .(sum(get(v1)))]) +test(11.069, d[, lapply(vN, sum), env=list(vN=as.list(setNames(nm = c(v1, v3))))], d[, lapply(mget(c(v1,v3)), sum)]) +test(11.070, d[, c(list(c1=c1, c2=c2), list(v3=v3), list(v4=v4, v5=v5)), env=list(v3=v3,v4=v4,v5=v5)], d) ## d[, c(list(c1, c2), list(get(v3)), mget(c(v4,v5)))] - some are unnamed +# empty input +d = data.table(x=1:2, y=1:4) +test(11.081, d[.i, env=list(.i=substitute()), verbose=TRUE], d, notOutput="after substitute") +test(11.082, d[.i, .j, .by, env=list(.i=substitute(), .j=substitute(), .by=substitute()), verbose=TRUE], d, notOutput="after substitute") +f = function(x, i, j, by) { + x[.i, .j, .by, env=list(.i=substitute(i), .j=substitute(j), .by=substitute(by)), 
verbose=TRUE] +} +test(11.083, f(d), d) +test(11.084, f(d, 1), d[1], output="Argument 'i' after substitute", notOutput="Argument 'j' after substitute") +test(11.085, f(d,, 1), d[,1], output="Argument 'j' after substitute", notOutput="Argument 'i' after substitute") +test(11.086, f(d, 1, 1), d[1, 1], output="Argument 'j' after substitute.*Argument 'i' after substitute") + +#1985 weird exception when by contains get +tb = data.table(x=c(1,2), y=c(3,4), z=c(5,6), w=c("a","b")) +test(11.101, tb[w != "b", .(x=sum(x)), by=.(y, zz=.z), env=list(.z="z")], data.table(y=3, zz=5, x=1)) +dtIris = as.data.table(iris) +speciesVar = "Species" +test(11.102, dtIris[Sepal.Length > 4, .N, by = .(var = .speciesVar, Petal.Width), env = list(.speciesVar = speciesVar)], dtIris[Sepal.Length > 4, .N, by = .(var = Species, Petal.Width)]) +#2589 Need an easier way to use dynamically determined symbols +dt = data.table(x1 = 1:10, x2 = 10:1, x3 = 1:10) +s1 = "x2"; s2 = "x3" +test(11.103, dt[, s1 * s2, env=list(s1=s1,s2=s2)], c(10L, 18L, 24L, 28L, 30L, 30L, 28L, 24L, 18L, 10L)) +#2884 Alternative way to dynamic symbol usage in `j` +dt = data.table(id = rep(1:2, 5), x1 = rnorm(10), x2 = rnorm(10), y1 = rnorm(10), y2 = rnorm(10)) +test(11.104, dt[, .(xsum = sum(x), ysum = sum(y)), by = id, env = list(x = "x1", y = "y2")], dt[, .(xsum=sum(x1), ysum=sum(y2)), by=id]) +#2816 Possible regression for programmatic use in `j` +dt = data.table(x=1:3) +var = "x" +dt[, var := var+1L, env=list(var="x")] +test(11.105, dt, data.table(x=2:4)) +# injecting quoted expressions +#750 `by=list(eval(as.name("colA")))` renames column +DT = data.table(colA=1:4, colB=5:8, colC=9:12) +test(11.106, DT[, sum(colA), by=list(grp_name=grp), env=list(grp_name="colA", grp="colA")], data.table(colA=1:4, V1=1:4)) +#2432 Add Programmable NSE +co2 = as.data.table(CO2) +Jexp1 = quote(max(conc)) +Jexp2 = quote(mean(conc)) +Jexp = substitute(list(Jexp1, round(Jexp2)), list(Jexp1=Jexp1, Jexp2=Jexp2)) +out = capture.output(ans <- co2[, 
j, by=Type, env=list(j=Jexp), verbose=TRUE]) +test(11.107, ans, data.table(Type=factor(c("Quebec","Mississippi"), levels=c("Quebec","Mississippi")), V1=c(1000,1000), V2=c(435,435))) +out = grep("Argument.*substitute", out, value=TRUE) +test(11.108, length(out), 2L) +test(11.109, "Argument 'by' after substitute: Type" %in% out, TRUE) +test(11.110, "Argument 'j' after substitute: list(max(conc), round(mean(conc)))" %in% out, TRUE) +#628 Change j=list(xout=eval(...))'s eval to eval within scope of DT +dat = data.table(x_one=1:10, x_two=1:10, y_one=1:10, y_two=1:10) +f = function(vars) as.call(c(quote(list), lapply(setNames(vars, paste(vars,"out",sep="_")), function(var) substitute2(one-two, list(one=paste(var,"one",sep="_"), two=paste(var,"two",sep="_")))))) +test(11.111, dat[, j, env=list(j = f(c("x","y")))], dat[, list(x_out = x_one - x_two, y_out = y_one - y_two)]) + +# vignette examples +square = function(x) x^2 +test(12.01, + substitute2(outer(inner(var1) + inner(var2)), env = list(outer = "sqrt", inner = "square", var1 = "a", var2 = "b")), + quote(sqrt(square(a) + square(b)))) +DT = as.data.table(iris) +test(12.02, + DT[, outer(inner(var1) + inner(var2)), env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width")], + DT[, sqrt(square(Sepal.Length) + square(Sepal.Width))]) +test(12.03, # return as data.table, substitute call argument name + DT[, .(Species, var1, var2, out = outer(inner(var1) + inner(var2))), env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width", out = "Sepal.Hypotenuse")], + DT[, .(Species, Sepal.Length, Sepal.Width, Sepal.Hypotenuse = sqrt(square(Sepal.Length) + square(Sepal.Width)))]) +test(12.04, # i, j, by + DT[filter_col %in% filter_val, .(var1, var2, out = outer(inner(var1) + inner(var2))), by = by_col, env = list(outer = "sqrt", inner = "square", var1 = "Sepal.Length", var2 = "Sepal.Width", out = "Sepal.Hypotenuse", filter_col = "Species", filter_val = I(c("versicolor", 
"virginica")), by_col = "Species")], + DT[Species %in% c("versicolor","virginica"), .(Sepal.Length, Sepal.Width, Sepal.Hypotenuse = sqrt(square(Sepal.Length) + square(Sepal.Width))), by = Species]) +test(12.05, # like base R, env AsIs class + substitute2(rank(input, ties.method = ties), env = I(list(input = as.name("Sepal.Width"), ties = "first"))), + quote(rank(Sepal.Width, ties.method = "first"))) +test(12.06, # only particular elements of env are AsIs class + substitute2(rank(input, ties.method = ties), env = list(input = "Sepal.Width", ties = I("first"))), + quote(rank(Sepal.Width, ties.method = "first"))) +test(12.07, # all are symbols + substitute2(f(v1, v2), list(v1 = "a", v2 = list("b", list("c", "d")))), + quote(f(a, list(b, list(c, d))))) +test(12.08, # 'a' and 'd' should stay as character + substitute2(f(v1, v2), list(v1 = I("a"), v2 = list("b", list("c", I("d"))))), + quote(f("a", list(b, list(c, "d"))))) +cols = c("Sepal.Length", "Sepal.Width") +test(12.09, # data.table automatically enlist nested lists into list calls + DT[, j, env = list(j = as.list(cols))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.10, # turning above 'j' list into a list call + DT[, j, env = list(j = quote(list(Sepal.Length, Sepal.Width)))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.11, # the same as above but accepts character vector + DT[, j, env = list(j = as.call(c(quote(list), lapply(cols, as.name))))], + DT[, list(Sepal.Length, Sepal.Width)]) +test(12.12, # list of symbols + DT[, j, env = I(list(j = lapply(cols, as.name))), verbose = TRUE], + error = "j-argument should be", + output = "list(Sepal.Length, Sepal.Width)") +test(12.13, substitute2(j, env = I(list(j = lapply(cols, as.name)))), lapply(cols, as.name)) +test(12.14, substitute2(j, env = list(j = as.list(cols))), as.call(c(quote(list), lapply(cols, as.name)))) +outer = "sqrt"; inner = "square"; vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") +syms = lapply(vars, as.name) 
+to_inner_call = function(var, fun) call(fun, var) +inner_calls = lapply(syms, to_inner_call, inner) +test(12.15, inner_calls, list(quote(square(Sepal.Length)), quote(square(Sepal.Width)), quote(square(Petal.Length)), quote(square(Petal.Width)))) +to_add_call = function(x, y) call("+", x, y) +add_calls = Reduce(to_add_call, inner_calls) +test(12.16, add_calls, quote(square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))) +rms = substitute2(expr = outer((add_calls) / len), env = list(outer = outer, add_calls = add_calls, len = length(vars))) +test(12.17, rms, quote(sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))) +test(12.18, + DT[, j, env = list(j = rms)], + DT[, sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L)]) +test(12.19, # same but skipping last substitute2 call and using add_calls directly + DT[, outer((add_calls) / len), env = list(outer = outer, add_calls = add_calls, len = length(vars))], + DT[, sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L)]) +j = substitute2(j, list(j = as.list(setNames(nm = c(vars, "Species", "rms"))))) # return as data.table +j[["rms"]] = rms +test(12.20, + DT[, j, env = list(j = j)], + DT[, .(Sepal.Length=Sepal.Length, Sepal.Width=Sepal.Width, Petal.Length=Petal.Length, Petal.Width=Petal.Width, Species, rms = sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))]) +j = as.call(c( # alternatively + quote(list), + lapply(setNames(nm = vars), as.name), + list(Species = as.name("Species")), + list(rms = rms) +)) +test(12.21, + DT[, j, env = list(j = j)], + DT[, .(Sepal.Length=Sepal.Length, Sepal.Width=Sepal.Width, Petal.Length=Petal.Length, Petal.Width=Petal.Width, Species, rms = sqrt((square(Sepal.Length) + square(Sepal.Width) + square(Petal.Length) + square(Petal.Width))/4L))]) +v1 = "Petal.Width" # get +v2 
= "Sepal.Width" +test(12.22, + DT[, .(total = sum(v1, v2)), env = list(v1 = v1, v2 = v2)], + DT[, .(total = sum(get(v1), get(v2)))]) +v = c("Petal.Width", "Sepal.Width") # mget +test(12.23, + DT[, lapply(v, mean), env = list(v = as.list(v))], + DT[, lapply(list(Petal.Width, Sepal.Width), mean)]) +test(12.24, + DT[, lapply(v, mean), env = list(v = as.list(setNames(nm = v)))], + DT[, lapply(mget(v), mean)]) +cl = quote(.(Petal.Width = mean(Petal.Width), Sepal.Width = mean(Sepal.Width))) +test(12.25, DT[, cl, env = list(cl = cl)], DT[, eval(cl)]) + +####################### +# contributed use cases +####################### + +# renkun-ken +dt = as.data.table(list( ## RNGversion("3.5.0"); set.seed(108); round(numeric(), 4) + symbol = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), + date = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), + grp1 = c(1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L), + grp2 = c(3L, 3L, 3L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 2L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 3L, 2L), + x0 = c(1.1396, -0.2706, -2.2801, -0.1572, -1.0671, -0.9666, -0.8071, -0.23, -0.1626, 1.4347, -0.2234, 0.5613, -0.7084, 0.2598, -0.2023, 1.8624, 0.5209, -1.561, -1.2297, -1.0064, -0.9782, -0.1291, -2.275, 0.5268, -0.5316, 2.3234, 0.0556, -0.3623, -0.5695, -0.0142), + x1 = c(1.3553, 1.2909, -0.8958, -0.3677, 1.0041, 1.1247, -0.0595, 0.7503, 0.3503, -1.559, -1.6823, -0.0906, 0.7874, 0.2785, -0.1712, -1.5325, 0.408, 0.5981, -1.1464, -0.2233, -0.0635, 0.4461, -1.9813, -0.7281, 1.1216, -0.0516, 1.373, 0.2388, 0.6257, -0.0551), + x2 = c(-0.2457, -0.9797, 0.3957, -1.094, -1.1973, 0.3137, 0.2004, -1.9404, 1.6927, -0.4063, 0.0731, -0.3338, -2.2683, -1.1105, 0.2115, -0.0163, 0.2139, 0.5016, 0.2296, 0.4189, 0.3295, 0.0408, 1.4633, 
-0.7118, 0.4811, 0.4499, -0.4214, 0.1503, -0.2222, 0.4573), + x3 = c(1.3439, 0.3841, -0.4787, -0.6312, -0.5481, -0.8703, -1.2684, -1.4851, 0.6789, 0.1575, 2.7873, -1.1201, 0.1337, -0.6053, -0.6538, 0.4597, -0.8955, 0.1625, 1.3767, 0.6024, -1.2141, -1.3534, -0.6583, -0.095, 1.1923, 0.3062, -0.6818, 0.2407, -0.8534, -1.4521), + y1 = c(-0.2159, 0.8934, 0.0216, -1.0682, 1.2549, -0.1517, 1.4404, 1.3436, -2.1388, -0.2453, -1.4628, -1.7654, 0.6437, -0.9685, -0.9393, 0.0962, -0.2041, 1.1007, -1.8705, 0.2053, -0.9238, -0.6301, 1.9876, 1.2862, 0.3363, -0.334, -1.5149, -1.3254, 0.5716, -0.7165), + y2 = c(-0.5962, 0.3394, -0.2971, -0.6241, -0.5279, 1.1945, -0.152, 0.8207, 0.8731, 0.2281, 0.3466, -1.4862, -0.4694, 0.0435, 0.9888, -0.0797, 0.7109, -0.6636, -0.4402, 1.0093, -0.0655, 0.5099, 1.5415, 1.8833, -1.2365, 0.5085, 0.7073, -0.2191, 0.2442, 0.1501), + y3 = c(0.6222, -0.7174, -1.9616, -0.0117, -0.114, 0.1313, -1.3854, 1.5021, -0.7115, 0.4822, 1.8474, 1.1742, 0.8192, 0.2819, -1.3365, -0.6179, -0.9706, 0.2179, -1.2654, 1.0065, -2.2514, -0.7161, 0.9578, -0.0335, 0.3166, 0.0471, -0.9983, -0.6455, 1.4064, 0.2954))) +xs = c("x", "y") ## apply same formula to different set of columns +out = vector("list", length(xs)) +names(out) = xs +for (x in xs) { + out[[x]] = capture.output(invisible(dt[, RATIO := (R3 - R2) * (R2 - R1) * (R3 - R1) / sqrt(R1^2 + R2^2 + R3^2), + env = list(RATIO = paste0(x, "_ratio"), R1 = paste0(x, 1), R2 = paste0(x, 2), R3 = paste0(x, 3)), + verbose = TRUE])) # assign to nul, other +} +x_rat = c(0.0150761734954921, 1.68603966340262, -0.432117480975587, 0.0673302370985585, +1.3396117186265, -1.31542975195976, 0.358990921654875, 1.07137398842599, -0.240804570258909, 0.689134697166349, 6.53944855876942, -0.167936293758913, 1.99518595021054, 0.478886131900058, 0.225672526235629, 0.898595029001403, -0.278725254056844, -0.0178774591562397, 2.20493313305713, 0.126869315798536, 0.554130827073314, -0.713268530169861, -3.79227895596263, 0.00622410754980975, 
-0.0188758915276097, -0.0471688415642347, -0.60391972591766, -4.09856489441073e-05, -0.732101471917737, 0.897197218930381) +y_rat = c(-0.437137931952723, -0.789182136098114, -0.530238437504097, 0.232242653273211, 0.739369921650875, -0.334413400872578, -2.76908561851941, -0.0259528361203494, -2.81810697204509, 0.149050554297973, 3.77409495341661, 0.84329199487865, -0.220290266022232, 0.298795199314652, 0.932599183107379, -0.107238527606129, 0.966425089066359, 1.05320054480325, -0.310406226974414, -0.00125245906648534, 1.02314586034282, 0.111130598215941, -0.0996278782862306, 0.66222170820334, 0.0364570881136429, -0.242779893874194, -1.00552326863148, -0.215191768368067, -0.206580227824426, 0.16140646232964) +test(101.01, dt$x_ratio, x_rat) +test(101.02, dt$y_ratio, y_rat) +test(101.03, length(grep("Argument.*substitute", out[["x"]], value=TRUE)), 1L) +test(101.04, length(grep("Argument.*substitute", out[["y"]], value=TRUE)), 1L) +test(101.05, "Argument 'j' after substitute: `:=`(x_ratio, (x3 - x2) * (x2 - x1) * (x3 - x1)/sqrt(x1^2 + x2^2 + x3^2))" %in% out[["x"]], TRUE) +test(101.06, "Argument 'j' after substitute: `:=`(y_ratio, (y3 - y2) * (y2 - y1) * (y3 - y1)/sqrt(y1^2 + y2^2 + y3^2))" %in% out[["y"]], TRUE) +daily_cor = function(data, x, y) { ## daily correlation of user input features + data[, .(cor = cor(x, y)), + keyby = date, + env = list(x = x, y = y), + verbose = TRUE] +} +out = capture.output(ans <- daily_cor(dt, "x0", "y2")) +test(101.07, length(grep("Argument.*substitute", out, value=TRUE)), 2L) ## 'by' (or 'keyby') is not substituted here but it still goes via substitute2 because it is non-missing +test(101.08, "Argument 'by' after substitute: date" %in% out, TRUE) +test(101.09, "Argument 'j' after substitute: .(cor = cor(x0, y2))" %in% out, TRUE) +group_cor = function(data, x, y, g) { ## group cor comparison of user input features + cor_dt = data[, lapply(.SD, function(x) cor(x, Y)), + keyby = .(group = GROUP), + .SDcols = x, + env = list(Y = y, GROUP 
= g), + verbose = TRUE] + melt.data.table(cor_dt, id.vars = "group", measure.vars = x, variable.name = "x", value.name = "cor", variable.factor = FALSE) ## not relevant but lets keep it for completeness +} +out = capture.output(dt1 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp1")) +test(101.10, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(101.11, "Argument 'by' after substitute: .(group = grp1)" %in% out, TRUE) +test(101.12, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +out = capture.output(dt2 <- group_cor(dt, c("x0", "x1", "x2"), "y1", "grp2")) +test(101.13, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(101.14, "Argument 'by' after substitute: .(group = grp2)" %in% out, TRUE) +test(101.15, "Argument 'j' after substitute: lapply(.SD, function(x) cor(x, y1))" %in% out, TRUE) +stats_dt1 = as.data.table(list( + x = c("x0", "x1", "x2"), + min = c(-0.325967794724422, -0.126026585686073, -0.398950077203113), + mean = c(-0.277318407860876, -0.0164428001010045, -0.220868266148565), + max = c(-0.22866902099733, 0.0931409854840638, -0.0427864550940165) +), key="x") +test(101.16, dt1[, .(min = min(cor), mean = mean(cor), max = max(cor)), keyby = x], stats_dt1) ## post aggregation with known colnames, not relevant but lets keep it for completeness +stats_dt2 = as.data.table(list( + x = c("x0", "x1", "x2"), + min = c(-0.392714958827804, -0.339274985404091, -0.45937864657761), + mean = c(-0.279968323960171, 0.150866984990403, 0.0838779176840593), + max = c(-0.180337725136444, 0.697473394580653, 0.714679537878464) +), key="x") +test(101.17, dt2[, .(min = min(cor), mean = mean(cor), max = max(cor)), keyby = x], stats_dt2) +set.seed(108) ## to many values to hardcode +yn = c(1, 5, 10, 20) +ycols = paste0("y", yn) +ydt = data.table(symbol = rep(1:3, each = 100)) +ydt[, date := seq_len(.N), by = symbol] +ydt[, ret := rnorm(.N)] +ydt[, (ycols) := shift(ret, yn, type = "lead"), by = symbol] +xdt = 
data.table(symbol = rep(1:2, each = 20)) +xdt[, date := seq_len(.N), by = symbol] +xdt[, `:=`(x1 = rnorm(.N), x2 = rnorm(.N))] +cor_xy = function(xdt, ydt, x, y) { ## cor between each x and a single y + xdt[ydt, y := Y, on = .(symbol, date), + env = list(Y = y), + verbose = TRUE] + on.exit(xdt[, y := NULL]) + xdt[, lapply(.SD, cor, y = y), keyby = symbol, .SDcols = x] +} +out = capture.output(ans <- cor_xy(xdt, ydt, c("x1", "x2"), "y10")) +exp = as.data.table(list(symbol = 1:2, x1 = c(0.529292252112253, 0.0301956035638738), x2 = c(0.287076866252898, -0.335969587268599)), key="symbol") +test(102.01, ans, exp) +test(102.02, length(grep("Argument.*substitute", out, value=TRUE)), 2L) +test(102.03, "Argument 'j' after substitute: `:=`(y, y10)" %in% out, TRUE) +test(102.04, "Argument 'i' after substitute: ydt" %in% out, TRUE) +cor_xy2 = function(xdt, ydt, x, y) { ## cor between each pair of x and y + rbindlist(lapply(y, function(yi) { + xdt[ydt, y := Y, on = .(symbol, date), + env = list(Y = yi)] + on.exit(xdt[, y := NULL]) + rbindlist(lapply(x, function(xi) { + xdt[, .(x = xi, y = yi, cor = cor(X, y)), keyby = symbol, + env = list(X = xi)] + })) + })) +} +cor_dt = cor_xy2(xdt, ydt, c("x1", "x2"), ycols) +exp = as.data.table(list( + symbol = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), + x = c("x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2", "x1", "x1", "x2", "x2"), + y = c("y1", "y1", "y1", "y1", "y5", "y5", "y5", "y5", "y10", "y10", "y10", "y10", "y20", "y20", "y20", "y20"), + cor = c(0.0963296961360529, -0.155702586981777, 0.45855688298414, -0.0867798048307359, -0.272158447799069, 0.0969909109333228, -0.172091337596075, -0.231918279862371, 0.529292252112253, 0.0301956035638738, 0.287076866252898, -0.335969587268599, 0.489259093604126, 0.190094143537513, 0.382176633086643, -0.0481151265706696) +)) +test(102.05, cor_dt, exp) +cor_xy3 = function(xdt, ydt, x, y) { ## cor matrix of existing columns and dynamically in-place 
merged columns + cl = as.call(lapply(setNames(c(":=", y), c("", y)), as.name)) + xdt[ydt, j, on = .(symbol, date), + env = list(j=cl)] + on.exit(xdt[, (y) := NULL]) + xdt[, cor(.SD), .SDcols = c(x, y)] +} +cor_mx = cor_xy3(xdt, ydt, c("x1", "x2"), ycols) +exp = structure(c( + 1, 0.242249239102964, -0.0286729531730845, -0.0936087330415663, 0.245575245812681, 0.323778522797129, 0.242249239102964, 1, 0.199165327684089, -0.160954354243643, 0.0034174556771777, 0.185518712777259, -0.0286729531730845, 0.199165327684089, 1, -0.164047186655086, -0.0689536633998918, -0.0326400434160486, -0.0936087330415663, -0.160954354243643, -0.164047186655086, 1, -0.0810998892055976, -0.106457956110047, 0.245575245812681, 0.0034174556771777, -0.0689536633998918, -0.0810998892055976, 1, 0.324977066952494, 0.323778522797129, 0.185518712777259, -0.0326400434160486, -0.106457956110047, 0.324977066952494, 1 + ), .Dim = c(6L, 6L), .Dimnames = list( + c("x1", "x2", "y1", "y5", "y10", "y20"), + c("x1", "x2", "y1", "y5", "y10", "y20") +)) +test(102.06, cor_mx, exp) +nadt = data.table(x1 = c(1, 2, NA, Inf), x2 = c(2, NA, 3, Inf), x3 = c(NA, 1, 2, 0)) ## fill abnormal values of multiple columns +dt_fill = function(data, columns, selector, fill) { + selector = match.fun(selector) + for (col in columns) { + data[selector(X), X := fill, env = list(X = col)] + } +} +dt_fill(nadt, c("x1", "x2", "x3"), is.na, 0) +test(103.01, nadt, data.table(x1 = c(1, 2, 0, Inf), x2 = c(2, 0, 3, Inf), x3 = c(0, 1, 2, 0))) +dt_fill(nadt, c("x1", "x2", "x3"), is.infinite, 0) +test(103.02, nadt, data.table(x1 = c(1, 2, 0, 0), x2 = c(2, 0, 3, 0), x3 = c(0, 1, 2, 0))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f3adef959d..9514dd5820 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -30,6 +30,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table dcast.data.table = 
data.table:::dcast.data.table + if (!exists('endsWith', 'package:base', inherits=FALSE)) endsWith = data.table:::endsWith forder = data.table:::forder forderv = data.table:::forderv format.data.table = data.table:::format.data.table @@ -53,6 +54,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { shallow = data.table:::shallow # until exported .shallow = data.table:::.shallow split.data.table = data.table:::split.data.table + if (!exists('startsWith', 'package:base', inherits=FALSE)) startsWith = data.table:::startsWith test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist @@ -101,6 +103,42 @@ if (!test_longdouble) { # e.g. under valgrind, longdouble.digits==53; causing these to fail: 1262, 1729.04, 1729.08, 1729.09, 1729.11, 1729.13, 1830.7; #4639 } +# generate simple error messages from base that are checked against in our tests. this helps +# protect us against these messages evolving in base in the future, and against these messages +# potentially not being produced in English. 
+# Three use cases: +# (1) match message exactly [missing delim] +# (2) match message pattern after dropping anything between delimeters [delim, fmt=FALSE] +# (3) function factory for matching messages exactly by substituting anything between delimeters [delim, fmt=TRUE] +get_msg = function(e, delim, fmt=FALSE) { + msg = tryCatch(e, error=identity, warning=identity)$message + if (missing(delim)) return(msg) + if (length(delim) == 1L) delim[2L] = delim[1L] + msg = gsub( + sprintf("%1$s[^%2$s]+%2$s", delim[1L], delim[2L]), + sprintf("%s%s%s", delim[1L], if (fmt) "%s" else ".+", delim[2L]), + msg + ) + if (fmt) return(function(x) sprintf(msg, x)) + return(msg) +} +base_messages = list( + missing_object = get_msg(`__dt_test_missing_` + 1, "'", fmt=TRUE), + missing_function = get_msg(`__dt_test_missing_`(), '"', fmt=TRUE), + invalid_arg_unary_operator = get_msg(-'a'), + invalid_arg_binary_operator = get_msg(1 + 'a'), + invalid_arg_sum = get_msg(sum('a'), c("\\(", "\\)"), fmt=TRUE), + arg_length_mismatch = get_msg(base::order(1, 1:2)), + empty_max = get_msg(max(numeric())), + empty_min = get_msg(min(numeric())), + coerce_na = get_msg(as.integer('a')), + locked_binding = get_msg({e = new.env(); e$x = 1; lockBinding('x', e); e$x = 2}, "'", fmt=TRUE), + missing_file = get_msg({tmp <- tempfile(tmpdir=tempfile("xxx")); file(tmp, "w")}, "'"), + # gives both error & warning but tryCatch returns the warning first, so suppress + cant_open_file = get_msg(suppressWarnings({con<-file(tempfile()); open(con, 'r')})), + mixed_subscripts = get_msg(letters[-1:1]) +) + ########################## test(1.1, tables(env=new.env()), null.data.table(), output = "No objects of class") @@ -977,7 +1015,7 @@ DT = data.table(a=1:5, b=6:10, c=11:15) test(327, within(DT,rm(a,b)), data.table(c=11:15)) test(328, within(DT,rm(b,c)), data.table(a=1:5)) test(329, within(DT,rm(b,a)), data.table(c=11:15)) -test(330, within(DT,rm(b,c,d)), data.table(a=1:5), warning="object 'd' not found") +test(330, 
within(DT,rm(b,c,d)), data.table(a=1:5), warning=base_messages$missing_object("d")) DT[,c("b","a")]=NULL test(332, DT, data.table(c=11:15)) test(333, within(DT,rm(c)), data.table(NULL)) @@ -1119,8 +1157,8 @@ test(378, cbind(), NULL) test(379, rbind(), NULL) DT = data.table(a=rep(1:3,1:3),b=1:6) -test(380, DT[,{.SD$b[1]=10L;.SD}, by=a], error="locked binding") # .SD locked for 1st group -test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error="locked binding") # .SD locked in 2nd group onwards too +test(380, DT[,{.SD$b[1]=10L;.SD}, by=a], error=base_messages$locked_binding(".SD")) # .SD locked for 1st group +test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error=base_messages$locked_binding(".SD")) # .SD locked in 2nd group onwards too # test that direct := is trapped, but := within a copy of .SD is allowed (FAQ 4.5). See also tests 556-557. test(382, DT[,b:=.N*2L,by=a], data.table(a=rep(1:3,1:3),b=rep(2L*(1:3),1:3))) @@ -1588,7 +1626,7 @@ test(534, names(transform(data.table('a b'=1), `c d`=`a b`)), c("a b","c d")) # Test keyby, new in v1.8.0 DT = data.table(a=INT(1,3,1,2,3,2),b=1:2,c=1:3,v=1:6) -test(535, DT[,sum(v),by=a, keyby=a], error="not both") +test(535, DT[,sum(v),by=a, keyby=a], error="When.*both.*keyby must be TRUE or FALSE") # updated after #4307 test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retains appearance order ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a") test(537, DT[,sum(v),keyby=a], ans) @@ -1672,7 +1710,7 @@ test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't be g DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) -test(573, DT[,sum(v),by="b, a"], error="object ' a' not found") +test(573, DT[,sum(v),by="b, a"], error=base_messages$missing_object(" a")) # Test base::unname, used by melt, and only supported by 
data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) @@ -2036,7 +2074,7 @@ if (ncol(DT)==2L) setnames(DT,c("A","B")) # else don't stop under torture with s test(714, DT[,z:=6:10], data.table(A=1:5,B=5,z=6:10)) # Test J alias is now removed outside DT[...] from v1.8.7 (to resolve rJava::J conflict) -test(715, J(a=1:3,b=4), error="could not find function.*J") +test(715, J(a=1:3,b=4), error=base_messages$missing_function("J")) # Test get in j DT = data.table(a=1:3,b=4:6) @@ -2254,7 +2292,7 @@ test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch= DT = data.table(a=1:3,b=4:6,c=7:9) # old tests using with=FALSE retained. Eventually will deprecate with=FALSE. test(812.1, DT[,!"b",with=FALSE], DT[,-match("b",names(DT)),with=FALSE]) -test(812.2, DT[,"foo",with=FALSE], error="column(s) not found: foo") +test(812.2, DT[,"foo",with=FALSE], error="column(s) not found: [foo]") test(812.3, DT[,!"foo",with=FALSE], DT, warning="column(s) not removed because not found: [foo]") test(812.4, DT[,!c("b","foo"),with=FALSE], DT[,list(a,c)], warning="column(s) not removed because not found: [foo]") test(812.5, DT[,!2:3,with=FALSE], DT[,-(2:3),with=FALSE]) # for consistency, but ! is really for character column names @@ -2274,7 +2312,7 @@ test(813.4, rownames(DT[2,"a"]), "1") # also repeat 812.* but without with=FALSE since that will be deprecated in future, and cover - as well as ! 
test(814.01, DT[,!"b"], DT[,c("a","c")]) test(814.02, DT[,-"b"], DT[,c("a","c")]) -test(814.03, DT[,"foo"], error="column(s) not found: foo") +test(814.03, DT[,"foo"], error="column(s) not found: [foo]") test(814.04, DT[,!"foo"], DT, warning="column(s) not removed because not found: [foo]") test(814.05, DT[,-"foo"], DT, warning="column(s) not removed because not found: [foo]") test(814.06, DT[,!c("b","foo")], DT[,list(a,c)], warning="column(s) not removed because not found: [foo]") @@ -2325,8 +2363,8 @@ test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda")) # setcolorder and merge check for dup column names, #2193(ii) setnames(DT2,"b","a") test(828, setcolorder(DT2,c("a","b")), error="x has some duplicated column name(s): a. Please remove or rename") -test(829, merge(DT1,DT2), error="y has some duplicated column name(s): a. Please remove or rename") -test(830, merge(DT2,DT1), error="x has some duplicated column name(s): a. Please remove or rename") +test(829, merge(DT1,DT2), error="y has some duplicated column name(s): [a]. Please remove or rename") +test(830, merge(DT2,DT1), error="x has some duplicated column name(s): [a]. 
Please remove or rename") # attribs such as "comments" should be retained, #2270 DT1 <- data.table(id = seq.int(1, 10), A = LETTERS[1:10], key = "id") @@ -3014,6 +3052,14 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) error="Unknown 'id.vars' type raw") test(1035.012, melt(DT, id.vars=1:3, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw") + test(1035.013, melt(data.table(a=1, b=1), id.vars=c(1,1)), data.table(a=1, a.1=1, variable=factor("b"), value=1), + output="Duplicate column names found") + test(1035.014, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1,c("1","2")), a=1, b=1)) + test(1035.015, melt(data.table(a=1+2i, b=1), id.vars="a"), error="Unknown column type 'complex' for column 'a' in 'data'") + + # na.rm=TRUE with list column value, PR#4737 + test(1035.016, melt(data.table(a1=1, b1=list(1:2), b2=list(c('foo','bar'))), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=list(1:2))) + test(1035.017, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=1))#this worked even before the PR. ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] @@ -3037,7 +3083,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.051, ans1, melt(DT, id.vars="id", measure.vars=list(c(5, 6), c(7, 8)))) test(1035.052, melt(DT, id.vars="id", measure.vars=list(as.raw(0))), error="Unknown 'measure.vars' type raw") - test(1035.06, ans1, melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) # should've no effect + test(1035.06, na.omit(ans1), melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) test(1035.07, ans1, melt(DT, id.vars="id", measure.vars=patterns("d_", "l_"))) # melt retains ordered factors! 
test(1035.08, melt(DT, id.vars="id", measure.vars=c("f_1", "f_2"), value.factor=TRUE)$value, factor(c(as.character(DT$f_1), as.character(DT$f_2)), ordered=TRUE)) @@ -3175,9 +3221,9 @@ Sep,33.5,19.4,15.7,11.9,0,100.8,100.8,0,12.7,12.7,0,174.1") x[, c("y1","z1"):=NA] test(1037.405, dim(melt(x, measure.vars=patterns("^y", "^z"))), INT(4,5)) test(1037.406, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE)), INT(2,5)) - test(1037.407, ans$variable, factor(c("1","1"))) + test(1037.407, ans$variable, factor(c("2","2"), c("1", "2"))) test(1037.408, dim(ans<-melt(x, measure.vars=patterns("^y", "^z"), na.rm=TRUE, variable.factor=FALSE)), INT(2,5)) - test(1037.409, ans$variable, c("1","1")) + test(1037.409, ans$variable, c("2","2")) test(1037.410, melt(data.table(NULL), verbose=TRUE), data.table(NULL), output="ncol(data) is 0. Nothing to melt") @@ -3426,7 +3472,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, set.seed(3) DT = data.table(a=5:1, b=runif(5)) ans = dcast(DT, a ~ b, value.var="b")[c(4,.N), c(2,6)] - setnames(ans, substring(names(ans),1,6)) + setnames(ans, substr(names(ans), 1L, 6L)) test(1102.06, ans, data.table("0.1680"=c(NA,DT[1,b]), "0.8075"=c(DT[2,b],NA))) # Fix for case 2 in bug report #71 - dcast didn't aggregate properly when formula RHS has "." 
@@ -3784,7 +3830,7 @@ test(1137.03, DT[, .SD, .SDcols=-"y"], DT[, c(1,3), with=FALSE]) test(1137.04, DT[, .SD, .SDcols=-c("y", "x")], DT[, 3, with=FALSE]) test(1137.05, DT[, .SD, .SDcols=-which(names(DT) %in% c("x", "y", "z"))], null.data.table()) test(1137.06, DT[, .SD, .SDcols=c(1, -2)], error=".SDcols is numeric but has both") -test(1137.07, DT[, .SD, .SDcols=c("x", -"y")], error="invalid argument to unary") +test(1137.07, DT[, .SD, .SDcols=c("x", -"y")], error=base_messages$invalid_arg_unary_operator) test(1137.08, DT[, .SD, .SDcols=c(-1, "x")], error="Some items of .SDcols are") DT <- data.table(x=1:5, y=6:10, z=11:15, zz=letters[1:5]) @@ -4527,8 +4573,7 @@ ix = with(DT, order(1-DT$x, decreasing=TRUE)) test(1251.07, DT[order(1-DT$x, decreasing=TRUE)], DT[ix]) test(1251.08, DT[order(x, list(-y), decreasing=TRUE)], error = "Column 2 is length 1 which differs from length of column 1.*10") -test(1251.09, DT[base::order(x, list(-y), decreasing=TRUE)], - error = "argument lengths differ") # data.table's error is more helpful than base's +test(1251.09, DT[base::order(x, list(-y), decreasing=TRUE)], error=base_messages$arg_length_mismatch) # data.table's error is more helpful than base's # more "edge cases" to ensure we're consistent with base test(1251.10, DT[order("a")], DT[1L]) test(1251.11, DT[order("b", "a")], DT[1L]) @@ -4907,7 +4952,7 @@ test(1290.34, DT[, names(DT) == "x", with=FALSE], as.data.table(ll[c(1,3,4)])) dt1 = data.table(a=character(0),b=numeric(0)) ans1 = data.table(a=character(0), b=numeric(0), c=numeric(0)) ans2 = data.table(a=character(0), b=numeric(0), c=numeric(0), d=integer(0)) -test(1291.1, dt1[, c:=max(b), by='a'], ans1, warning="no non-missing arguments to max") +test(1291.1, dt1[, c:=max(b), by='a'], ans1, warning=base_messages$empty_max) test(1291.2, dt1[, d := integer(0), by=a], ans2) # Bug #21 @@ -4947,7 +4992,7 @@ test(1294.02, dt[, a := 1.5]$a, rep(1L, 3L), test(1294.03, dt[, a := NA]$a, rep(NA_integer_, 3L)) test(1294.04, dt[, a := 
"a"]$a, rep(NA_integer_, 3L), warning=c("Coercing 'character' RHS to 'integer'.*column 1 named 'a'", - "NAs introduced by coercion")) + base_messages$coerce_na)) test(1294.05, dt[, a := list(list(1))]$a, rep(1L, 3L), warning="Coercing 'list' RHS to 'integer' to match.*column 1 named 'a'") test(1294.06, dt[, a := list(1L)]$a, rep(1L, 3L)) @@ -4957,7 +5002,7 @@ test(1294.09, dt[, b := 1L]$b, rep(1,3)) test(1294.10, dt[, b := NA]$b, rep(NA_real_,3)) test(1294.11, dt[, b := "bla"]$b, rep(NA_real_, 3), warning=c("Coercing 'character' RHS to 'double' to match.*column 2 named 'b'", - "NAs introduced by coercion")) + base_messages$coerce_na)) test(1294.12, dt[, b := list(list(1))]$b, rep(1,3), warning="Coercing 'list' RHS to 'double' to match.*column 2 named 'b'") test(1294.13, dt[, b := TRUE]$b, rep(1,3)) @@ -5121,7 +5166,8 @@ test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) -DT <- data.table(x=rep(1:6, each=3), y=sample(c("", letters[1:3], NA), 18, TRUE)) +DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) +DT[x==7, y := c("","b","c")] test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x]) test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) @@ -5129,8 +5175,8 @@ test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := NA_character_] test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("a","a","c","","a",NA)), warning="No non-missing") -test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("b","a","c","a","c",NA)), warning="No non-missing") +test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")), warning="No non-missing") +test(1313.30, DT[, max(y, na.rm=TRUE), by=x], 
data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")), warning="No non-missing") # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = @@ -5839,7 +5885,7 @@ test(1380, DT[a==TRUE], DT[3:4]) # Fix #847, as.data.table.list and character(0) issue x <- data.table(a=character(0), b=character(0), c=numeric(0)) setkey(x, a, b) -test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 has 0 rows but longest item has 1; filled with NA") +test(1381, x[J("foo", character(0)), nomatch=0L], x) # Fix for #813 and #758 DT = data.table(x = 1:2) @@ -5885,7 +5931,11 @@ test(1388, as.character(x), c("00:00:01", "-00:00:01", "-01:01:40")) # Fix for #880. Another eval(parse(.)) issue. DT <- as.data.table(iris) DT[, foo := "Species"] -test(1389, copy(DT)[,bar := eval(parse(text=foo[1]), envir=.SD)], copy(DT)[, bar := Species]) +test(1389.1, copy(DT)[,bar := eval(parse(text=foo[1]), envir=.SD)], copy(DT)[, bar := Species]) +# another test from #1181 for completeness +DT1 = data.table(a = 1, key = 'a') +DT2 = data.table(c = 1, fn = list(quote(5*a)), key = 'c') +test(1389.2, DT1[, n:=eval(DT2[a]$fn[[1]], .SD)], data.table(a=1, n=5, key="a")) # Fix for foverlaps() floating point interval (double) types. 
Should increment them by machine tolerance, not by 1L DT1 = data.table(start=c(0.88), end=c(0.88)) @@ -6656,6 +6706,7 @@ if (test_xts) { setcolorder(dt, c(2, 3, 1)) dt[ , char_col := 'a'] test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') + if (base::getRversion() < "3.6.0") rm(as.xts) # 890 -- key argument for as.data.table.xts x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) @@ -6666,6 +6717,10 @@ if (test_xts) { " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", " 9: 1970-01-10 9", "10: 1970-01-11 10")) options(old) + + # as.data.table.xts(foo) had incorrect integer index with a column name called 'x', #4897 + M = xts::as.xts(matrix(1, dimnames=list("2021-05-23", "x"))) # xts:: just to be extra robust; shouldn't be needed with rm(as.xts) above + test(1465.19, inherits(as.data.table(M)$index,"POSIXct")) Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) } @@ -6894,13 +6949,12 @@ test(1486.1, as.data.frame(ans1.1), as.data.frame(ans1.2)) test(1486.2, as.data.frame(ans2.1), as.data.frame(ans2.1)) # Fix for #832 -x <- matrix(1:9, ncol=3) -setattr(x, "names", paste("V", seq_len(length(x)), sep = "")) +x <- matrix(1:9, ncol=3L) +setattr(x, "names", paste0("V", seq_along(x))) test(1487.1, setattr(x, "class", c("data.table", "data.frame")), error="Internal structure doesn't seem to be a list") -x <- matrix(1:9, ncol=3) +x <- matrix(1:9, ncol=3L) class(x) = c("data.table", "data.frame") -# not sure how to test this one, so using `tryCatch` -test(1487.2, tryCatch(print(x), error=function(k) "bla"), "bla") +test(1487.2, print(x), error="dim.data.table expects a data.table as input") # Fix for #1043 DT = data.table(grp=LETTERS[1:2], categ=rep(c("X","Y"), each=2L), condition=rep(c("P","Q"), each=4L), value=sample(8)) @@ -7294,18 +7348,22 @@ test(1530.4, which.last(x), tail(which(x), 1L)) set.seed(2L) x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="") y = factor(sample(c(letters[1:5], x), 20, TRUE)) -xsub = substring(x, 1L, 1L) 
-test(1532.1, y %like% xsub[1L], grepl(xsub[1L], y)) -test(1532.2, y %like% xsub[2L], grepl(xsub[2L], y)) -test(1532.3, like(y, xsub[1L]), grepl(xsub[1L], y)) -test(1532.4, like(y, xsub[2L]), grepl(xsub[2L], y)) +xsub = substr(x, 1L, 1L) +test(1532.01, y %like% xsub[1L], grepl(xsub[1L], y)) +test(1532.02, y %like% xsub[2L], grepl(xsub[2L], y)) +test(1532.03, like(y, xsub[1L]), grepl(xsub[1L], y)) +test(1532.04, like(y, xsub[2L]), grepl(xsub[2L], y)) ## %ilike% and %flike% for #3333 x = c('HEY', 'hey', '()') -test(1532.5, like(x, 'hey', ignore.case = TRUE), c(TRUE, TRUE, FALSE)) -test(1532.6, like(x, '()'), c(TRUE, TRUE, TRUE)) -test(1532.7, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE)) -test(1532.8, x %ilike% 'hey', c(TRUE, TRUE, FALSE)) -test(1532.9, x %flike% '()', c(FALSE, FALSE, TRUE)) +test(1532.05, like(x, 'hey', ignore.case = TRUE), c(TRUE, TRUE, FALSE)) +test(1532.06, like(x, '()'), c(TRUE, TRUE, TRUE)) +test(1532.07, like(x, '()', fixed = TRUE), c(FALSE, FALSE, TRUE)) +test(1532.08, x %ilike% 'hey', c(TRUE, TRUE, FALSE)) +test(1532.09, x %flike% '()', c(FALSE, FALSE, TRUE)) +## %like% test for ordered factor with NA +x = c("A", "B", "C", NA_character_) +x = ordered(x, levels = rev(x)[-1L]) +test(1532.10, x %like% "A", c(TRUE, FALSE, FALSE, FALSE)) # coverage for setkey() to 100% dt1 = data.table(x=sample(5), y=1:5, key="y") @@ -8314,10 +8372,18 @@ DT2 = data.table(id1=c("c", "w", "b"), val=50:52) test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla")) # warn when merge empty data.table #597 -test(1601.1, merge(data.table(a=1),data.table(a=1), by="a"), data.table(a=1, key="a")) -test(1601.2, tryCatch(merge(data.table(a=1),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'y' argument is 0 columns data.table.") -test(1601.3, tryCatch(merge(data.table(NULL),data.table(a=1), by="a"), warning = function(w) w$message), "You are trying to join 
data.tables where 'x' argument is 0 columns data.table.") -test(1601.4, tryCatch(merge(data.table(NULL),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'x' and 'y' arguments are 0 columns data.table.") +DT0 = data.table(NULL) +DT1 = data.table(a=1) +test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a")) +test(1601.2, merge(DT1, DT0, by="a"), + warning="You are trying to join data.tables where 'y' has 0 columns.", + error="Elements listed in `by`") +test(1601.3, merge(DT0, DT1, by="a"), + warning="You are trying to join data.tables where 'x' has 0 columns.", + error="Elements listed in `by`") +test(1601.4, merge(DT0, DT0, by="a"), + warning="You are trying to join data.tables where 'x' and 'y' have 0 columns.", + error="Elements listed in `by`") # fix for #1549 d1 <- data.table(v1=1:2,x=x) @@ -8460,17 +8526,17 @@ test(1613.21, all.equal(DT2, DT1, ignore.row.order = TRUE), "Dataset 'current' h # test attributes: key DT1 <- data.table(a = 1:4, b = letters[1:4], key = "a") DT2 <- data.table(a = 1:4, b = letters[1:4]) -test(1613.22, all.equal(DT1, DT2), "Datasets has different keys. 'target': a. 'current' has no key.") +test(1613.22, all.equal(DT1, DT2), "Datasets have different keys. 'target': [a]. 'current': has no key.") test(1613.23, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) test(1613.24, all.equal(DT1, setkeyv(DT2, "a"), check.attributes = TRUE), TRUE) # test attributes: index DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(a = 1:4, b = letters[1:4]) setindexv(DT1, "b") -test(1613.25, all.equal(DT1, DT2), "Datasets has different indexes. 'target': b. 'current' has no index.") +test(1613.25, all.equal(DT1, DT2), "Datasets have different indices. 'target': [b]. 'current': has no index.") test(1613.26, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) -test(1613.27, all.equal(DT1, setindexv(DT2, "a")), "Datasets has different indexes. 'target': b. 
'current': a.") -test(1613.28, all.equal(DT1, setindexv(DT2, "b")), "Datasets has different indexes. 'target': b. 'current': a, b.") +test(1613.27, all.equal(DT1, setindexv(DT2, "a")), "Datasets have different indices. 'target': [b]. 'current': [a].") +test(1613.28, all.equal(DT1, setindexv(DT2, "b")), "Datasets have different indices. 'target': [b]. 'current': [a, b].") test(1613.29, all.equal(DT1, setindexv(setindexv(DT2, NULL), "b")), TRUE) # test custom attribute DT1 <- data.table(a = 1:4, b = letters[1:4]) @@ -8479,7 +8545,7 @@ setattr(DT1, "custom", 1L) test(1613.30, all.equal(DT1, DT2), "Datasets has different number of (non-excluded) attributes: target 3, current 2") test(1613.31, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) setattr(DT2, "custom2", 2L) -test(1613.32, all.equal(DT1, DT2), "Datasets has attributes with different names: custom, custom2") +test(1613.32, all.equal(DT1, DT2), "Datasets has attributes with different names: [custom, custom2]") setattr(DT1, "custom2", 2L) setattr(DT2, "custom", 0L) test(1613.33, all.equal(DT1, DT2), paste0("Attributes: < Component ", dQuote("custom"), ": Mean relative difference: 1 >")) @@ -9500,7 +9566,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { runcmb = as.data.table(runcmb[, 1:min(100L, ncol(runcmb)), drop=FALSE]) # max 100 combinations to test runops = lapply(runcmb, function(cols) { thisops = sample(ops, k, TRUE) - thisops[substring(cols,1,1)=="c"] = "==" + thisops[startsWith(cols, "c")] = "==" thisops }) is_only_na <- function(x) is.na(x) & !is.nan(x) @@ -9948,7 +10014,8 @@ test(1670.2, class(as.data.table(x)), class(x)[2:3]) # #1676, `:=` with by shouldn't add cols on supported types dt = data.table(x=1, y=2) -test(1671, dt[, z := sd, by=x], error="invalid type/length (closure/1)") +test(1671, dt[, z := sd, by=x], + error=gettextf("invalid type/length (%s/%d) in vector allocation", "closure", 1L, domain="R")) # 1683 DT <- data.table(V1 = rep(1:2, 3), V2 = 1:6) @@ -10286,11 
+10353,11 @@ if (.Platform$OS.type=="unix") { cat("a,b\n4,2", file=f<-tempfile()) cmd <- sprintf("cat %s", f) options(datatable.fread.input.cmd.message = TRUE) - test(1703.01, fread(cmd), ans<-data.table(a=4L, b=2L), message="Please use fread.cmd=.*security concern.*Please read item 5 in the NEWS file for v1.11.6") + test(1703.01, fread(cmd), ans<-data.table(a=4L, b=2L), message="security concern.*Please read item 5 in the NEWS file for v1.11.6") options(datatable.fread.input.cmd.message = NULL) # when option is missing as it is by default, then TRUE test(1703.02, fread(cmd), ans, message="security concern") options(datatable.fread.input.cmd.message = FALSE) - test(1703.03, tryCatch(fread(cmd), message=stop), ans) + test(1703.03, fread(cmd), ans) options(datatable.fread.input.cmd.message = NULL) test(1703.04, fread(cmd=cmd), ans) test(1703.05, fread(file=cmd), error=sprintf("File '%s' does not exist", cmd)) @@ -10315,7 +10382,8 @@ if (.Platform$OS.type=="unix") { test(1703.15, fread("."), error="File '.' is a directory. 
Not yet implemented.") # tmpdir argument d = tempfile("dir") -test(1703.16, fread(text=c('a,b','1,2'), tmpdir=d), error="cannot open the connection", warning="No such file or directory") +test(1703.16, fread(text=c('a,b','1,2'), tmpdir=d), + error=base_messages$cant_open_file, warning=base_messages$missing_file) dir.create(d) test(1703.17, fread(text=c('a,b','1,2'), tmpdir=d), data.table(a=1L,b=2L)) test(1703.18, fread(text=c('a,b','1,2')), data.table(a=1L, b=2L)) @@ -10382,8 +10450,8 @@ test(1722.2, DT[,(!is.na(as.numeric(FieldName)))], c(TRUE,TRUE,FALSE,TRUE,FALSE, test(1723.1, DT[removalIndex>0,rowId-(2*removalIndex-1)], c(-2,-11,-5,-14)) test(1723.2, DT[removalIndex>0,(rowId-(2*removalIndex-1))], c(-2,-11,-5,-14)) DT = data.table(FieldName = c("1", "2", "3", "four", "five", "6")) -test(1724.1, DT[, is.na(as.numeric(FieldName))], c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE), warning="NAs introduced by coercion") -test(1724.2, DT[, !is.na(as.numeric(FieldName))], c(TRUE,TRUE,TRUE,FALSE,FALSE,TRUE), warning="NAs introduced by coercion") +test(1724.1, DT[, is.na(as.numeric(FieldName))], c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE), warning=base_messages$coerce_na) +test(1724.2, DT[, !is.na(as.numeric(FieldName))], c(TRUE,TRUE,TRUE,FALSE,FALSE,TRUE), warning=base_messages$coerce_na) # Ensure NA's are added properly when a new column is added, not all the target rows are joined to, and the number of i # rows is equal or greater than the number of rows in the target table. 
@@ -10834,7 +10902,8 @@ test(1743.217, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor test(1743.218, sapply(fread("a,b,c,d,e,f\na,b,c,d,e,f", colClasses = list(factor = c(1, 2, 4), factor = 3), select = c(5, 4, 2, 3)), class), y = c(e = "character", d = "factor", b = "factor", c = "factor")) test(1743.22, fread("a,b,c\n1999/01/01,2,f", colClasses=list(Date=1L), drop="a"), data.table(b=2L, c="f")) -test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), warning="NAs introduced by coercion.*left as type 'character'") +test(1743.231, fread("a,b,c\n2,1,4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c="4i"), + warning=paste0(base_messages$coerce_na, ".*left as type 'character'")) test(1743.232, fread("a,b,c\n2,1,3+4i", colClasses=list(complex="c", integer=2L), drop="a"), data.table(b=1L, c=3+4i)) test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b"), drop="a"), data.table(b=2L, c="f")) test(1743.242, fread("a,b,c\n2,2,f", colClasses = c("integer", "integer", "factor"), drop="a"), data.table(b=2L, c=factor("f"))) @@ -10874,7 +10943,9 @@ test(1743.308, fread(data1743, colClasses=list(NULL=c("C","D")), drop=1:2), data test(1743.311, fread(data1743, colClasses="NULL"), ans<-data.table(A=1:2, B=3:4, C=5:6, D=7:8), warning="colClasses.*quoted.*interpreted as colClasses.*NULL") test(1743.312, fread(data1743, colClasses=character()), ans) test(1743.32, fread("A,B\na,0+1i", colClasses="complex"), data.table(A="a", B=1i), - warning="Column 'A' was requested to be 'complex'.*NAs introduced by coercion.*column has been left as.*character") + warning=paste0("Column 'A' was requested to be 'complex'.*", + base_messages$coerce_na, + ".*column has been left as.*character")) test(1743.33, fread(data1743, colClasses=list("character"=4, "numeric"=c(2,NA,1))), data.table(A=c(1,2), B=c(3,4), C=5:6, D=c("7","8")), warning="colClasses[[2]][2] is NA") 
test(1743.34, fread(data1743, select=list("character"=4, "numeric"=c(2,NA,1))), data.table(D=c("7","8"), B=c(3,4), A=c(1,2)), warning="colClasses[[2]][2] is NA") old = options(warn=2) @@ -11009,7 +11080,7 @@ test(1750.10, # groupingsets on aggregate using grouping col char type and sum - error test(1750.11, groupingsets(dt, j = lapply(.SD, sum), by = c("status","year"), sets=list(character()), .SDcols="color"), - error = "invalid 'type' (character) of argument" + error=base_messages$invalid_arg_sum("character") ) # groupingsets on aggregate using grouping col factor type and sum - error test(1750.12, @@ -11059,9 +11130,9 @@ test(1750.19, uniqueN({ ), 1L, warning = "'sets' contains a duplicate") # entries in `by` / `sets` not exists in data.table test(1750.20, exists("notexist"), FALSE) # https://github.com/Rdatatable/data.table/issues/3055#issuecomment-423364960 -test(1750.21, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","notexist"), sets=list(c("color"), character()), id=TRUE), error = "object 'notexist' not found") +test(1750.21, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","notexist"), sets=list(c("color"), character()), id=TRUE), error=base_messages$missing_object("notexist")) test(1750.22, groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=list(c("color"), "stat"), id=TRUE), error = "Columns used in 'sets' but not present in 'by': [stat]") -test(1750.23, groupingsets(dt, j = .(a=sum(notexist)), by = c("color","year","status"), sets=list(c("color"), character()), id=TRUE), error = "object 'notexist' not found") +test(1750.23, groupingsets(dt, j = .(a=sum(notexist)), by = c("color","year","status"), sets=list(c("color"), character()), id=TRUE), error=base_messages$missing_object("notexist")) # update by ref `:=` forbidden test(1750.24, groupingsets(dt, j = sum_value := sum(value), by = c("color","year","status"), sets=list(c("color"), character())), @@ 
-11388,16 +11459,18 @@ if (exists("B")) rm(B) if (exists("NOTEXIST")) rm(NOTEXIST) if (exists("MyCol")) rm(MyCol) DT <- data.table(A = c(FALSE, TRUE), B = 2:1, C=c(2,3), MyCol=c(2,2)) -test(1773.01, DT[A], error = "A is not found in calling scope but it is a column of type logical.*==TRUE.*When the first argument") -test(1773.02, DT[B], error = "B is not found in calling scope but it is a column of type integer.*DT\\[\\(col\\)\\].*When the first argument") # 697 -test(1773.03, DT[C], error = "i has evaluated to type closure. Expecting logical, integer or double") # C picks up stats::C in calling scope -test(1773.04, DT[MyCol], error="MyCol is not found in calling scope but it is a column of type double.*DT\\[\\(col\\)\\].*When the first argument") -test(1773.05, DT[NOTEXIST], error = "NOTEXIST is not found in calling scope and it is not a column name either. When the first argument") +test(1773.01, DT[A], error = "'A' is not found in calling scope, but it is a column of type logical.*==TRUE.*When the first argument") +test(1773.02, DT[B], error = "'B' is not found in calling scope, but it is a column of type integer.*DT\\[\\(col\\)\\].*When the first argument") # 697 +test(1773.03, DT[C], error = "'C' is not found in calling scope, but it is a column of type double") # C picks up stats::C in calling scope +test(1773.04, DT[MyCol], error="'MyCol' is not found in calling scope, but it is a column of type double.*DT\\[\\(col\\)\\].*When the first argument") +test(1773.05, DT[NOTEXIST], error = "'NOTEXIST' is not found in calling scope and it is not a column name either. 
When the first argument") test(1773.06, DT[(A)], DT[2]) test(1773.07, DT[A==TRUE], DT[2]) test(1773.08, DT[(B)], data.table(A=c(TRUE,FALSE), B=1:2, C=c(3,2), MyCol=2)) test(1773.09, DT[(MyCol)], data.table(A=c(TRUE,TRUE), B=INT(1,1), C=c(3,3), MyCol=2)) test(1773.10, DT[(C)], data.table(A=c(TRUE,NA), B=c(1L,NA), C=c(3,NA), MyCol=c(2,NA))) +test(1773.11, data.table(subset=c(TRUE,FALSE))[subset], # i being a function name that's also a column name, #5014 + error="'subset' is not found in calling scope, but") # New as.data.table.array method in v1.10.5 set.seed(1L) @@ -13026,11 +13099,11 @@ test(1923.2, indices(DT, vectors=TRUE), list(c("V1"))) DT = data.table(varname = 1) test(1924.1, DT[var_name==1], error='not found\\. Perhaps you intended.*varname') test(1924.2, DT[variable==1], error='Object.*not found among') -test(1924.3, DT[varname+'a'], error='non-numeric argument') +test(1924.3, DT[varname+'a'], error=base_messages$invalid_arg_binary_operator) DT[, VAR_NAME:=2] -test(1924.4, DT[var_name==1], error="Object 'var_name' not found. Perhaps you intended varname, VAR_NAME") +test(1924.4, DT[var_name==1], error="Object 'var_name' not found. Perhaps you intended [varname, VAR_NAME]") DT = setDT(lapply(integer(50), function(...) numeric(1L))) -test(1924.5, DT[V==0], error='Perhaps you intended.*V1.*V5 or 45 more') +test(1924.5, DT[V==0], error='Perhaps you intended.*V1.*V10, [.]{3}') # test suite of as.ITime methods (subsumes #2870) s = c('1970-01-01 00:00:00.1234', '2005-10-12 09:45:32.84') @@ -13184,15 +13257,15 @@ test(1948.09, DT[i, on = eval(eval("id<=idi"))], DT[i, on = "id<=idi"]) test(1948.10, DT[i, on = ""], error = "'on' contains no column name: . Each 'on' clause must contain one or two column names.") test(1948.11, DT[i, on = "id>=idi>=1"], error = "Found more than one operator in one 'on' statement: id>=idi>=1. Please specify a single operator.") test(1948.12, DT[i, on = "`id``idi`<=id"], error = "'on' contains more than 2 column names: `id``idi`<=id. 
Each 'on' clause must contain one or two column names.") -test(1948.13, DT[i, on = "id != idi"], error = "Invalid operators !=. Only allowed operators are ==<=<>=>.") +test(1948.13, DT[i, on = "id != idi"], error = "Invalid join operators [!=]. Only allowed operators are [==, <=, <, >=, >].") test(1948.14, DT[i, on = 1L], error = "'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") # helpful error when on= is provided but not i, rather than silently ignoring on= DT = data.table(A=1:3) -test(1949.1, DT[,,on=A], error="object 'A' not found") # tests .1 to .4 amended after #3621 -test(1949.2, DT[,1,on=A], error="object 'A' not found") -test(1949.3, DT[on=A], error="object 'A' not found") -test(1949.4, DT[,on=A], error="object 'A' not found") +test(1949.1, DT[,,on=A], error=base_messages$missing_object("A")) # tests .1 to .4 amended after #3621 +test(1949.2, DT[,1,on=A], error=base_messages$missing_object("A")) +test(1949.3, DT[on=A], error=base_messages$missing_object("A")) +test(1949.4, DT[,on=A], error=base_messages$missing_object("A")) test(1949.5, DT[1,,with=FALSE], error="j must be provided when with=FALSE") test(1949.6, DT[], output="A.*1.*2.*3") # no error test(1949.7, DT[,], output="A.*1.*2.*3") # no error, #3163 @@ -13259,14 +13332,17 @@ test(1957.3, fread("A,B\na,b\nc,d\n", stringsAsFactors=TRUE, verbose=TRUE), data output="stringsAsFactors=TRUE converted 2 column(s): [A, B]") # misc. 
coverage tests in fread -test(1958.1, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be") -test(1958.2, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L)) -test(1958.3, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L)) -test(1958.4, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns') -test(1958.5, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747 -test(1958.6, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical())) -test(1958.7, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547 -test(1958.8, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8))) +test(1958.01, fread('\U0001f64d', encoding = 'UTF-16'), error = "Argument 'encoding' must be") +test(1958.02, fread('a,b\n1,2', nrows = NA_real_), data.table(a = 1L, b = 2L)) +test(1958.03, fread('a,b\n1,2', nrows = -1), data.table(a = 1L, b = 2L)) +test(1958.04, fread('a,b\n1,2', key = 1), error = 'must be a character vector naming columns') +test(1958.05, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0), data.table(A=logical(), B=logical(), C=logical())) #2747 +test(1958.06, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0, sep=','), data.table(A=logical(), B=logical(), C=logical())) +test(1958.07, fread('A,B,C,D\n"a,b",4,5,6\n"c,d",6,7\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(6,NA))) # 2547 +test(1958.08, fread('A,B,C,D\n"a,b",4,5\n"c,d",6,7,8\n', fill=TRUE), data.table(A=c("a,b","c,d"), B=INT(4,6), C=INT(5,7), D=INT(NA,8))) +# 4686 +test(1958.09, fread("A,B,C\n1,2,3\n3,4,5\n0,0,0\n", nrows=0L), data.table(A=logical(), B=logical(), C=logical())) +test(1958.10, fread("A,B,C\n1,2,3\n3,4,5\n0,0,100\n", nrows=0L, sep=','), data.table(A=logical(), B=logical(), 
C=logical())) # Skip should work with all types of newlines #3006 eols = c("\n", "\r\n", "\r", "\n\r") @@ -13332,8 +13408,7 @@ test(1962.004, duplicated(DT, by = -1L), error = 'specify non existing column*.*-1') test(1962.005, duplicated(DT, by = 'y'), error = 'specify non existing column*.*y') -test(1962.0061, duplicated(data.table(NULL)), logical(0L)) -test(1962.0062, duplicated(data.table(a = 1L), by = character()), logical()) +test(1962.006, duplicated(data.table(NULL)), logical(0L)) test(1962.007, unique(DT, incomparables = TRUE), error = 'not used (yet)') @@ -13751,7 +13826,7 @@ test(1967.34, data.table(1:5, NULL), data.table(V1=1:5)) ### if (novname[i]) vnames[[i]] = namesi ### but, on pause for now pending #3193 ### test(1967.35, data.table(1:5, matrix(6:15, nrow = 5L)) -test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_), warning="Item 2 has 0 rows but longest item has 5; filled with NA") +test(1967.35, data.table(1:5, integer(0L)), data.table(integer(0L), integer(0L))) # no longer NA-fill zero-length, PR#4262 test(1967.36, data.table(1:5, key = 5L), error = 'must be character') x = data.table(a = 1:5) @@ -13773,12 +13848,12 @@ test(1967.49, x[ , list(5) := 6], error = 'LHS of := must be a symbol') test(1967.50, x[ , 1 + 3i := 6], error = "LHS of := isn't column names") test(1967.511, x[ , .(5L), by = .EACHI, mult = 'all'], error='logical error. i is not data.table') test(1967.512, x[1+3i], error='i has evaluated to type complex. 
Expecting logical, integer or double') -test(1967.521, x[1:2, by=a], x[1:2,], warning="Ignoring by= because j= is not supplied") -test(1967.522, x[, by=a], x, warning=c("Ignoring by= because j= is not supplied","i and j are both missing.*upgraded to error in future")) -test(1967.523, x[by=a], x, warning=c("Ignoring by= because j= is not supplied","i and j are both missing.*upgraded to error in future")) -test(1967.524, x[1:2, keyby=a], x[1:2,], warning="Ignoring keyby= because j= is not supplied") -test(1967.525, x[, keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future")) -test(1967.526, x[keyby=a], x, warning=c("Ignoring keyby= because j= is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.521, x[1:2, by=a], x[1:2,], warning="Ignoring by/keyby because 'j' is not supplied") +test(1967.522, x[, by=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.523, x[by=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.524, x[1:2, keyby=a], x[1:2,], warning="Ignoring by/keyby because 'j' is not supplied") +test(1967.525, x[, keyby=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) +test(1967.526, x[keyby=a], x, warning=c("Ignoring by/keyby because 'j' is not supplied","i and j are both missing.*upgraded to error in future")) test(1967.53, as.matrix(x, rownames = 2:3), error='length(rownames)==2 but') test(1967.54, as.matrix(x[0L]), structure(integer(0), .Dim = c(0L, 2L), .Dimnames = list(NULL, c("a", "b")))) @@ -13793,7 +13868,7 @@ test(1967.57, setnames(x), error = 'x has 2 columns but its names are length 0') names(x) = c('a', 'b') test(1967.58, names(setnames(x, new = c('b', 'c'))), c('b', 'c')) test(1967.59, setnames(x, 1:2, c(8L, 9L)), 
error = "'new' is not a character") -test(1967.60, setnames(x, -1:1, c('hey', 'you')), error = "mixed.*negative") +test(1967.60, setnames(x, -1:1, c('hey', 'you')), error = base_messages$mixed_subscripts) test(1967.61, setnames(x, 1+3i, 'cplx'), error = "'old' is type complex") test(1967.62, setnames(x, 1, c('d', 'e')), error = "'old' is length 1 but 'new'") test(1967.621, setnames(x, 1:2, c("a","a")), data.table(a=1:5, a=6:10)) @@ -13839,7 +13914,7 @@ test(1967.75, x[!y, sum(i4), on = 'i1', by = .EACHI, verbose = TRUE], data.table(i1 = c(169L, 369L), V1 = c(270L, 179L)), output = "not-join called with 'by=.EACHI'.*done") test(1967.76, x[!y, sum(i4), on = 'i1', verbose = TRUE], 510L, - output = 'Inverting irows for notjoin.*sec') + output = 'Inverting irows for notjoin.*[0-9]s') x[ , v := 0] ### hitting by = A:B branch test(1967.77, x[ , .(v = sum(v)), by = i1:i4], x[-10L]) @@ -14012,7 +14087,9 @@ test(1984.05, DT[ , sum(b), keyby = c, verbose = TRUE], ### hitting byval = eval(bysub, setattr(as.list(seq_along(xss)), ...) 
test(1984.06, DT[1:3, sum(a), by=b:c], data.table(b=10:8, c=1:3, V1=1:3)) test(1984.07, DT[, sum(a), by=call('sin',pi)], error='must evaluate to a vector or a list of vectors') -test(1984.08, DT[, sum(a), by=as.raw(0)], error='column or expression.*type raw') +test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*type 'raw'.*not.*supported") +test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 + error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error='The items.*list are length[(]s[)] [(]1,2[)].*Each must be length 10; .*rows in x.*after subsetting') options('datatable.optimize' = Inf) test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], @@ -15067,6 +15144,8 @@ test(2041.1, DT[, median(date), by=g], data.table(g=c("a","b"), V1=as.Date(c("20 test(2041.2, DT[, median(time), by=g], DT[c(2,5),.(g=g, V1=time)]) # 'invalid trim argument' with optimization level 1; #1876 +# these tests check via output= that level 1 is on, and also that level 2 is on (which includes level 1). +# They could run in level 1 with level 2 off, but output= would need to be changed and there's no need. 
test(2042.1, DT[ , as.character(mean(date)), by=g, verbose=TRUE ], data.table(g=c("a","b"), V1=c("2018-01-04","2018-01-21")), output=msg<-"GForce is on, left j unchanged.*Old mean optimization is on, left j unchanged") @@ -15075,7 +15154,19 @@ test(2042.1, DT[ , as.character(mean(date)), by=g, verbose=TRUE ], Jan.2018 = format(strptime("2018-01-01", "%Y-%m-%d"), "%b-%Y") test(2042.2, DT[ , format(mean(date),"%b-%Y")], Jan.2018) test(2042.3, DT[ , format(mean(date),"%b-%Y"), by=g, verbose=TRUE ], # just this case generated the error - data.table(g=c("a","b"), V1=c(Jan.2018, Jan.2018)), output=msg) + data.table(g=c("a","b"), V1=c(Jan.2018, Jan.2018)), output=msg) +# also incidentally fixed #2491 +DT = data.table( + Group = c("A", "A", "B", "B", "C", "C"), + Date1 = `class<-`(c(17446.0291040738, 17470.0221205444, 17445.0765226481, # `class<-`() == .Date() to pass on R 3.1.0 + 17456.0360002079, 17440.0230725919, 17451.0572453837), "Date"), + Date2 = `class<-`(c(17459.1561177987, 17451.1086757995, 17449.0820898537, + 17443.1175238448, 17461.0463715783, 17448.1033968224), "Date") +) +DT[ , DiffTime := abs(difftime(Date1, Date2, units = 'days'))] +test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], + data.table(Group=c("A", "B", "C"), V1=structure(c(16, 8, 12), class="difftime", units="days")), + output="Old mean optimization is on, left j unchanged.*GForce.*FALSE") # gforce wrongly applied to external variable; #875 DT = data.table(x=INT(1,1,1,2,2), y=1:5) @@ -15310,6 +15401,19 @@ options(old) test(2049.2, outer$ab, list(data.table(a=1:3, b=4L))) test(2049.3, outer$ab[[1]][, b := 5L], data.table(a=1:3, b=5L)) test(2049.4, outer$ab, list(data.table(a=1:3, b=5L))) +test(2049.5, {DT=data.table(d=list(data.table(a=1))); DT$d[[1]][, new_col:=NA]; DT}, # verbatim from #1629 + data.table(d = list(data.table(a=1, new_col=NA)))) +# extra tests on similar theme to #1629 added in PR#4366 ... 
+add_col1 = function(dt) { + if (is.data.table(dt)) dt[, new_col:=NA] + if (is.list(dt)) lapply(dt, add_col1) + invisible() +} +DT = data.table(a=c(1,2), b=list(data.table(d=c("a", "b"), e=c(100, 200)))) +test(2049.6, add_col1(DT), NULL) +test(2049.7, names(DT), c("a","b","new_col")) +test(2049.8, names(DT$b[[1L]]), c("d","e","new_col")) +test(2049.9, names(DT$b[[2L]]), c("d","e","new_col")) # rbindlist zero row DT should retain its (unused) levels, #3508 DT = data.table(f = factor(c("a", "b", "c"))) @@ -15613,7 +15717,7 @@ DT <- data.table( f_1 = factor(c('a', 'c', 'b', NA, 'c', 'b', 'c', 'c', NA, 'c', NA, 'c', 'a', 'b', NA, NA, NA, 'a')), c_1 = c("a", "c", NA, NA, NA, "c", "b", NA, "a", "b", NA, "a", "c", "b", "c", "b", "a", "b") ) -test(2063.1, melt(DT, id=1:2, measure=3:4), melt(DT, id=c("i_1", "i_2"), measure=c("f_1", "c_1"))) +test(2063.1, melt(DT, id=1:2, measure.vars=3:4), melt(DT, id=c("i_1", "i_2"), measure.vars=c("f_1", "c_1"))) ## fun --> fun.aggregate DT = melt(as.data.table(ChickWeight), id.vars=2:4) setnames(DT, tolower(names(DT))) @@ -15797,7 +15901,7 @@ test(2072.009, fifelse(test_vec, rep(1L,11L), rep(0L,10L)), error="Length o test(2072.010, fifelse(test_vec, rep(1,10L), rep(0,11L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).") test(2072.011, fifelse(test_vec, rep(TRUE,10L), rep(FALSE,10L)), error="Length of 'yes' is 10 but must be 1 or length of 'test' (11).") test(2072.012, fifelse(0:1, rep(TRUE,2L), rep(FALSE,2L)), error="Argument 'test' must be logical.") -test(2072.013, fifelse(test_vec, TRUE, "FALSE"), error="'yes' is of type logical but 'no' is of type character. Please") +test(2072.013, fifelse(test_vec, TRUE, "FALSE"), error="'no' is of type character but 'yes' is logical. 
Please") test(2072.014, fifelse(test_vec, list(1),list(2,4)), error="Length of 'no' is 2 but must be 1 or length of 'test' (11).") test(2072.015, fifelse(test_vec, list(1,3),list(2,4)), error="Length of 'yes' is 2 but must be 1 or length of 'test' (11).") test(2072.016, fifelse(test_vec, list(1), list(0)), as.list(as.numeric(out_vec))) @@ -15823,7 +15927,7 @@ test(2072.031, fifelse(test_vec_na, "1", rep("0",12L)), as.character(out_vec_na) test(2072.032, fifelse(test_vec_na, rep("1",12L), "0"), as.character(out_vec_na)) test(2072.033, fifelse(test_vec_na, rep("1",12L), rep("0",12L)), as.character(out_vec_na)) test(2072.034, fifelse(test_vec_na, "1", "0"), as.character(out_vec_na)) -test(2072.035, fifelse(test_vec, as.Date("2011-01-01"), FALSE), error="'yes' is of type double but 'no' is of type logical. Please") +test(2072.035, fifelse(test_vec, as.Date("2011-01-01"), FALSE), error="'no' is of type logical but 'yes' is double. Please") test(2072.036, fifelse(test_vec_na, 1+0i, 0+0i), as.complex(out_vec_na)) test(2072.037, fifelse(test_vec_na, rep(1+0i,12L), 0+0i), as.complex(out_vec_na)) test(2072.038, fifelse(test_vec_na, rep(1+0i,12L), rep(0+0i,12L)), as.complex(out_vec_na)) @@ -16260,7 +16364,7 @@ test(2100.03, fifelse(test_vec_na, TRUE, FALSE, TRUE), as.logical(out_vec_na)) test(2100.04, fifelse(test_vec_na, "1", "0","2"), as.character(out_vec_na)) test(2100.05, fifelse(test_vec_na, 1+0i, 0+0i, 2+0i), as.complex(out_vec_na)) test(2100.06, fifelse(c(TRUE,FALSE,NA), list(1:5), list(5:1), list(15:11)), list(1:5,5:1,15:11)) -test(2100.07, fifelse(test_vec_na, 1, 0, 2L), error = "'yes' is of type double but 'na' is of type integer. 
Please make sure that both arguments have the same type.") +test(2100.07, fifelse(test_vec_na, 1, 0, 2L), c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2)) # corece na test(2100.08, fifelse(test_vec_na, 1, 0, c(2,3)), error = "Length of 'na' is 2 but must be 1") test(2100.09, fifelse(date_vec_na, as.Date("2019-08-31"), as.Date("2019-08-30"), as.Date("2019-08-29")), as.Date(c(18139, 18138, 18138, 18138, 18138, 18137), origin = '1970-01-01')) test(2100.10, fifelse(date_vec_na, as.Date("2019-08-31"), as.Date("2019-08-30"), 18137), error = "'yes' has different class than 'na'. Please make sure that both arguments have the same class.") @@ -16609,7 +16713,7 @@ set.seed(1) vDT = data.table(i_id = unique(iDT$i_id))[, .(v = runif(5,0,10), p = sample(c(5,5,10,10,10))), by=i_id] test(2120.01, !exists("i_id")) # quick verify in case there's an i_id in .GlobalEnv when testing in dev test(2120.02, iDT[i_id, order(e_date, e_time)], # first of all, the correct error - error="i_id is not found in calling scope but it is a column of type character") + error="'i_id' is not found in calling scope, but it is a column of type character") tmp = vDT[c("B","C","A"), on=.(i_id), .N, by=.EACHI] # split long statement in 2120.05 up as per demo in #3669 test(2120.03, tmp, data.table(i_id=c("B","C","A"), N=5L)) # just make sure the helper tmp is correct test(2120.04, tmp[iDT[i_id, order(e_date, e_time)]], # i_id obtained from tmp; this is what broke in dev 1.12.3 @@ -16651,20 +16755,20 @@ t0 = as.POSIXct('2019-10-01') test(2124.1, format(as.ITime(t0)), '00:00:00') test(2124.2, format(as.IDate(t0)), '2019-10-01') if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) -# careful to unset because TZ="" means UTC whereas unset TZ means local +# careful to unset because TZ="" means UTC whereas unset TZ means local, #4261 and #4464 # trunc.cols in print.data.table, #4074 -old_width = options("width" = 40) +old_width = options("width" = 40L) # Single row printing (to check issue with losing attributes) 
DT = data.table(a = "aaaaaaaaaaaaa", b = "bbbbbbbbbbbbb", c = "ccccccccccccc", d = "ddddddddddddd") test(2125.01, - capture.output(print(DT, trunc.cols=TRUE))[3], + capture.output(print(DT, trunc.cols=TRUE))[3L], "2 variables not shown: [c, d]") # Printing with dots -DT = data.table(a = vector("integer", 102), +DT = data.table(a = vector("integer", 102L), b = "bbbbbbbbbbbbb", c = "ccccccccccccc", d = c("ddddddddddddd", "d")) @@ -16696,8 +16800,12 @@ test(2125.03, capture.output(print(DT, trunc.cols=TRUE, row.names=FALSE)), " 0 bbbbbbbbbbbbb ccccccccccccc", " 0 bbbbbbbbbbbbb ccccccccccccc", "1 variable not shown: [d]" )) -test(2125.04, capture.output(print(DT, trunc.cols=TRUE, class=TRUE))[14], - "1 variable not shown: [d ]") +# also testing #4266 -- getting width of row #s register right +# TODO: understand why 2 variables truncated here. a,b,c combined have width +# _exactly_ 40, but still wraps. If we set options(width=41) it won't truncate. +# seems to be an issue with print.default. 
+test(2125.04, capture.output(print(DT, trunc.cols=TRUE, class=TRUE))[14L], + "2 variables not shown: [c , d ]") test(2125.05, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, row.names=FALSE))[c(1,14)], c(" a b c", "1 variable not shown: [d ]" )) @@ -16705,8 +16813,8 @@ test(2125.06, capture.output(print(DT, trunc.cols=TRUE, col.names="none"))[c(1,1 c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc", "1 variable not shown: [d]" )) test(2125.07, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, col.names="none"))[c(1,13)], - c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc", - "1 variable not shown: [d]" ), + c(" 1: 0 bbbbbbbbbbbbb", + "2 variables not shown: [c, d]" ), warning = "Column classes will be suppressed when col.names is 'none'") options("width" = 20) DT = data.table(a = vector("integer", 2), @@ -16896,7 +17004,6 @@ test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanot test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") rm(s1, s2, class2132) - if (test_xts) { # keep.rownames in as.data.table.xts() supports a string, #4232 xts = xts::xts(1:10, structure(1:10, class = "Date")) @@ -17124,6 +17231,10 @@ if (TZnotUTC) { test(2150.20, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), ans, output=ans_print) } +# fread single row single column datetime field, #2609 +test(2150.21, fread("c1\n2018-01-31 03:16:57"), data.table(V1=as.IDate("2018-01-31"), c1="03:16:57"), + warning="Detected 1 column names but the data has 2 columns") +test(2150.22, fread("c1\n2018-01-31 03:16:57", sep=""), data.table(c1=as.POSIXct("2018-01-31 03:16:57", tz="UTC"))) options(old) # 1 is treated as . 
in dcast formula, #4615 @@ -17191,11 +17302,11 @@ test(2158.2, DT[, by="index", list(value=list(value))], DT = data.table(x = 1) test(2159.01, typeof(as.matrix(DT)), "double") test(2159.02, typeof(as.matrix(DT[0L])), "double") -test(2159.03, min(DT[0L]), Inf, warning="missing") # R's warning message; use one word 'missing' to insulate from possible future changes to R's message +test(2159.03, min(DT[0L]), Inf, warning=base_messages$empty_min) DT = data.table(x = 1L) test(2159.04, typeof(as.matrix(DT)), "integer") test(2159.05, typeof(as.matrix(DT[0L])), "integer") -test(2159.06, min(DT[0L]), Inf, warning="missing") +test(2159.06, min(DT[0L]), Inf, warning=base_messages$empty_min) DT = data.table(x = TRUE) test(2159.07, typeof(as.matrix(DT)), "logical") test(2159.08, typeof(as.matrix(DT[0L])), "logical") @@ -17273,14 +17384,388 @@ if (test_bit64) { test(2164.3, d[, mean(b, na.rm=TRUE), by=a], data.table(a=INT(1,2), V1=c(2.5, 4))) } -# Test new feature %notin%, #4152 -test(2165.1, 11 %notin% 1:10, TRUE) -test(2165.2, "a" %notin% c(), TRUE) -test(2165.3, "a" %notin% c("a", "b", "c"), FALSE) -test(2165.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) -test(2165.5, "a" %notin% character(), TRUE) -test(2165.6, "a" %notin% integer(), TRUE) -test(2165.7, "a" %notin% NULL, TRUE) -test(2165.8, NA %notin% 1:5, TRUE) -test(2165.9, NA %notin% c(1:5, NA), FALSE) +# invalid key when by=.EACHI, haskey(i) but on= non-leading-subset of i's key, #4603 #4911 +X = data.table(id = c(6456372L, 6456372L, 6456372L, 6456372L,6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L, 6456372L), + id_round = c(197801L, 199405L, 199501L, 197901L, 197905L, 198001L, 198005L, 198101L, 198105L, 198201L, 198205L, 198301L, 198305L, 198401L), + field = c(NA, NA, NA, "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine", "medicine"), + key = "id") +Y = data.table(id = c(6456372L, 6456345L, 6456356L), + 
id_round = c(197705L, 197905L, 201705L), + field = c("medicine", "teaching", "health"), + prio = c(6L, 1L, 10L), + key = c("id_round", "id", "prio", "field" )) +test(2165.1, X[Y, on = .(id, id_round > id_round, field), .(x.id_round[1], i.id_round[1]), by=.EACHI][id==6456372L], + data.table(id=6456372L, id_round=197705L, field='medicine', V1=197901L, V2=197705L)) +# Y$id_round happens to be sorted, so in 2165.2 we test Y$field which is not sorted +test(2165.2, X[Y, on="field", .(x.id_round[1]), by=.EACHI][field=="health"], + data.table(field="health", V1=NA_integer_)) +# a minimal example too ... +X = data.table(A=c(4L,2L,3L), B=1:3, key="A") +Y = data.table(A=2:1, B=2:3, key=c("B","A")) +test(2165.3, X[Y], data.table(A=2:3, B=2:3, i.A=2:1, key="A")) # keyed +test(2165.4, X[Y, on=.(A)], data.table(A=2:1, B=c(2L,NA), i.B=2:3)) # no key +test(2165.5, X[Y, on=.(A), x.B, by=.EACHI], data.table(A=2:1, x.B=c(2L,NA))) # no key + +# missing j was caught in groupingsets but not cube, leading to unexpected error message, #4282 +DT = data.table(a=1) +test(2166, cube(DT, by='a'), error="Argument 'j' is required") + +# fwrite support encoding "native" and "UTF-8", #1770 +latin1 = "fa\xE7ile" +Encoding(latin1) = "latin1" +utf8 = iconv(latin1, "latin1", "UTF-8") +text = c(latin1, utf8, "aaaaaaaa") +dt = data.table(A = text, B = as.factor(text)) +dt2 = data.table(A = text, B = text) +csvfile = tempfile(fileext = ".csv") +fwrite(dt, csvfile, encoding = "UTF-8", bom = TRUE) +test(2167.1, fread(csvfile, encoding = "UTF-8"), dt2) +if (identical(text, enc2native(text))) { # ensure native encoding can represent latin1 strings + fwrite(dt, csvfile, encoding = "native") + test(2167.2, fread(csvfile), dt2) +} +test(2167.3, fwrite(dt, csvfile, encoding="nativ"), error="Argument 'encoding' must be") +unlink(csvfile) + +# check valid trunc.cols=, #4766 +DT = data.table(x = rnorm(10)) +test(2168.01, print(DT, trunc.cols = 5L), error=c("Valid options for trunc.cols are TRUE and FALSE")) 
+test(2168.02, print(DT, trunc.cols = NA), error=c("Valid options for trunc.cols are TRUE and FALSE")) +test(2168.03, print(DT, trunc.cols = "thing"), error=c("Valid options for trunc.cols are TRUE and FALSE")) +test(2168.04, print(DT, trunc.cols = c(TRUE, FALSE)), error=c("Valid options for trunc.cols are TRUE and FALSE")) + +# shallow copy of .SD must be unlocked for frank using na.last=NA or ties.method='random', #4429 +DT = data.table(a=1:10) +test(2169.1, DT[ , frankv(.SD, ties.method='average', na.last=NA)], as.double(1:10)) +test(2169.2, DT[ , frankv(.SD, ties.method='random')], 1:10) +# coverage tests for some issues discovered on the way +DT[, c('..na_prefix..', '..stats_runif..') := 1L] +test(2169.3, DT[ , frankv(.SD, ties.method='average', na.last=NA)], error="Input column '..na_prefix..' conflicts") +test(2169.4, DT[ , frankv(.SD, ties.method='random')], error="Input column '..stats_runif..' conflicts") + +# which=NA inconsistent with ?data.table, #4411 +DT = data.table(A = c(NA, 3, 5, 0, 1, 2), B = c("foo", "foo", "foo", "bar", "bar", "bar")) +test(2170.1, DT[A > 1, which = NA], c(1L,4:5)) +test(2170.2, DT[A > -1, which = NA], 1L) +test(2170.3, DT[A > -1 | is.na(A), which = NA], integer()) +test(2170.4, DT[A > 10, which = NA], seq_len(nrow(DT))) +test(2170.5, DT[!(A > 1), which = NA], c(1:3,6L)) # matches DT[A <= 1, which = NA] + +# data.table() zero-nrow result if any non-null & atomic element is length 0, #3727 +test(2171.1, data.table(A=double(), B=1:2), data.table(A=double(), B=integer())) +DT = data.table(CODE=c('a','b'), DATE=1:2, VALUE=c(1.3, 1.5), key=c('CODE','DATE')) +test(2171.2, DT[J(character(), 1), VALUE], double()) # because "J" is a wrapper of list() +test(2171.3, data.table(A=NULL, B=1.0), data.table(B=1.0)) # NULL is omited +test(2171.4, NROW(data.table(A=list(), B=1.0)), 1L) # empty list() regarded as `list(list())` which is length 1, and recycled +DT = data.table(A=1:3, B=letters[1:3]) +test(2171.5, ans <- DT[A>3, .(ITEM='A>3', A, 
B)], # now identical as expected + DT[A>3][, .(ITEM='A>3', A, B)]) +test(2171.6, ans, data.table(ITEM=character(), A=integer(), B=character())) # not just identical to each other, but correct too + +# don't remove 'newclass' from jval's result, #4324 +A = data.table(COL = 'dt') +class(A) = c('newclass', class(A)) +DT = data.table(LIST_COL = list(A, A)) +test(2172, class(DT[1, LIST_COL[[1]]]), class(A)) + +# as.data.table.list edits list elements, so must be sure x does not use some other `[[` method, #4526 +x = data.frame(a = 1:5) +x$b = matrix(6:15, ncol=2L) +class(x) = c('foo', 'data.frame') +`[[.foo` = function(x, i) { + if (any(sapply(x, inherits, 'data.table'))) stop('failure') + as.list(x)[[i]] +} +test(2173, as.data.table(x), data.table(a=1:5, b.V1=6:10, b.V2=11:15)) + +# rbind two length-0 ordered factors, #4795 +DT = data.table(A = ordered(character())) +test(2174, rbind(DT, DT), DT) + +## set row.names when a null data.table has a column assigned for the first time, #4597 +DT = data.table() +test(2175.1, attr(DT[, x:=1:5], "row.names"), 1:5) +DT = data.table() +set(DT, j=c("v1","v2"), value=list(1:6, 2:7)) +test(2175.2, attr(DT, "row.names"), 1:6) +DT = data.table(x=integer()) +test(2175.3, DT[, y:=3L], data.table(x=integer(), y=integer())) # in keeping with recent #4262, view as recycling the length-1 3L to match the length-0 data + +# `keyby`=TRUE/FALSE together with by=, #4307 +DT = data.table(a=2:1, b=3:2, d=4:3) +test(2176.1, DT[, .SD, by="a", keyby=FALSE], data.table(a=2:1,b=3:2,d=4:3)) +test(2176.2, DT[, .SD, by="a", keyby=TRUE], data.table(a=1:2,b=2:3,d=3:4, key="a")) + +# check fwrite output using new default separator option, #4956 +DT = data.table(a=1, b=2) +options(datatable.fwrite.sep='\t') +test(2177.01, fwrite(DT), output='a\tb\n1\t2') +options(datatable.fwrite.sep=';') +test(2177.02, fwrite(DT), output='a;b\n1;2') +options(datatable.fwrite.sep=NULL) +test(2177.03, fwrite(DT), output='a,b\n1,2') + +# segfault when joining and grouping and 
some rows don't match, #4892 +x = data.table(id = 1:4, key = 'id') +y = data.table(id = 2:5, key = 'id') +z = data.table(c=c(2L, 2L, 1L, 1L), id=c(2L, 4L, 3L, NA)) +test(2178, x[y, .SD, by=.(c(2L, 1L, 2L, 1L))], z) + +# assigning all-na length>1 to a factor column was segfault, #4824 +DT = data.table(FACTOR = factor(rep("a", 3L))) +set(DT, i=1:2, j="FACTOR", value=rep(NA, 2L)) +test(2179, DT$FACTOR, factor(c(NA, NA, "a"))) + +# deleting duplicated column name removes only first +DT = data.table(a=1, b=2, a=3) +test(2180, DT[, a:=NULL], data.table(b=2, a=3)) + +# as.data.table(table(NULL)) was error, #4179 +test(2181, as.data.table(table(NULL)), data.table(NULL)) + +# some missing variables in melt, #4027 +DT.wide = data.table(a2=2, b1=1, b2=2) +expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) +test(2182.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) +test(2182.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected) +DTid = data.table(DT.wide, id=1) +exid = data.table(id=1, expected) +test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) +test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) +test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 + +### First block testing measurev +# new variable_table attribute for measure.vars, PR#4731 for multiple issues +measurev = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. +test(2183.00001, melt(DT.wide, measure.vars=measurev()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +measurev = list("foo", "bar")#measurev below should not use this since it is not a function. 
+test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' not supported for joining/merging") +test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword +iris.dt = data.table(datasets::iris) +test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") +test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: 1") +test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: 1,2") +test(2183.00027, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim="bar"), sep=".")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: dim") +test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list =3 must be same as max number of items after splitting column names =2") +test(2183.00042, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function()1), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr") +test(2183.00043, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=interactive), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a 
function with at least one argument, problem: istr") +test(2183.00044, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function(x)1), pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") +test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list =3 must be same as number of capture groups in pattern =2") +test(2183.00048, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, value.name=NULL), sep=".")), error="elements of fun.list should be uniquely named, problems: value.name") +# measure with factor conversion. +myfac = function(x)factor(x)#user-defined conversion function. +test(2183.00060, melt(DTid, measure.vars=measurev(list(letter=myfac, value.name=NULL), pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) + +### Second block testing measure +# new variable_table attribute for measure.vars, PR#4731 for multiple issues +measure = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used. +test(2183.01, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2))) +measure = list("foo", "bar")#measure below should not use this since it is not a function. 
+test(2183.02, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="Type 'complex' not supported for joining/merging") +test(2183.03, melt(DTid, measure.vars=structure(list(a=c(NA,"a2"),b=c("b1","b2")), variable_table=data.table(number=as.complex(1:2)))), error="variable_table does not support column type 'complex' for column 'number'") +test(2183.04, melt(DTid, measure.vars=measure(value.name, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) +test(2183.05, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword +test(2183.06, melt(DTid, measure.vars=structure(list(1, 2), variable_table="foo")), error="variable_table attribute of measure.vars should be either NULL or a data table") +test(2183.07, melt(DTid, measure.vars=structure(1:3, variable_table="foo")), error="variable_table attribute of measure.vars should be either NULL or a data table") +test(2183.08, melt(DTid, measure.vars=structure(1:3, variable_table=data.table())), error="variable_table attribute of measure.vars should be a data table with at least one column") +test(2183.09, melt(DTid, measure.vars=structure(1:3, variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =3") +test(2183.10, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") +test(2183.11, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=list(x=1:2, y=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors 
=2")#make sure to check each list element, not just the first. +# general measure errors. +iris.dt = data.table(datasets::iris) +test(2183.20, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") +# school example. +schools.wide <- data.table( + school = c("A","B"), + read_1 = c(1.1,2.1), read_1_sp = c(T,T), + read_2 = c(1.2,2.2), + math_1 = c(10.1,20.1), math_1_sp = c(T,T), + math_2 = c(NA,20.2), math_2_sp = c(NA,F)) +schools.tall <- melt(schools.wide, na.rm=TRUE, measure.vars=measure(subject, number=as.integer, value.name=function(x)ifelse(x=="", "score", "sp"), pattern="([^_]+)_([12])(.*)")) +schools.expected = data.table(school=c("A","B","A","B","B"), subject=c("read","read","math","math","math"), number=as.integer(c(1,1,1,1,2)), score=c(1.1,2.1,10.1,20.1,20.2), sp=c(T,T,T,T,F)) +test(2183.21, schools.tall, schools.expected) +who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3) +test(2183.22, melt(who, measure.vars=measure(diagnosis, gender, ages, ymin=as.numeric, ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), pattern="new_?(?.*)_(?.)(?(?0|[0-9]{2})(?[0-9]{0,2}))")), data.table(id=1, diagnosis=c("sp","rel"), gender=c("m","f"), ages=c("5564","65"), ymin=c(55,65), ymax=c(64,Inf), value=c(2,3))) +wide.again = dcast(schools.tall, school ~ subject + number, value.var = c("score","sp")) +# measure with sep= +test(2183.23, melt(wide.again, na.rm=TRUE, measure.vars=measure(value.name, subject, number=as.integer))[order(score)], schools.expected)#should work without sep due to same default _ as dcast. 
+test(2183.24, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep="."))), c("Species", "dim", "Sepal", "Petal")) +test(2183.25, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width")) +test(2183.26, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value")) +test(2183.27, melt(iris.dt, measure.vars=measure(value.name, dim="bar", sep=".")), error="each ... argument to measure must be a function with at least one argument, problem: dim") +test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of ... arguments to measure =3 must be same as max number of items after splitting column names =2") +test(2183.29, melt(iris.dt, measure.vars=measure()), error="each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") +# patterns with iris data. +test(2183.40, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value")) +# measure with pattern= +test(2183.41, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr") +test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... 
argument to measure must be a function with at least one argument, problem: istr") +test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr") +test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2") +test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") +test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: 1") +test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="... arguments to measure should be uniquely named, problems: value.name") +# measure with factor conversion. +myfac = function(x)factor(x)#user-defined conversion function. +test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) +# measure errors. 
+iris.i <- 1 +iris.num <- datasets::iris[iris.i, 1:4] +iris.days <- data.table( + day1=iris.num, day2=iris.num, Species=iris$Species[iris.i]) +test(2183.61, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning=base_messages$coerce_na) +test(2183.62, melt(iris.days, measure.vars=measure(before=function(x)rep(4, length(x)), value.name, dim, sep=".")), error="number of unique groups after applying type conversion functions less than number of groups, change type conversion") +test(2183.63, melt(iris.days, measure.vars=measure(before, value.name, dim, pattern="(day)[12][.](.*)[.](.*)")), error="number of unique column IDs =4 is less than number of melted columns =8; fix by changing pattern/sep") +test(2183.64, melt(iris.days, measure.vars=measure(day=as.integer, value.name, dim, pattern="day(.)[.](.*)[.](.*)")), data.table(Species=factor("setosa"), day=as.integer(c(1,2,1,2)), dim=c("Length","Length","Width","Width"), Sepal=c(5.1,5.1,3.5,3.5), Petal=c(1.4,1.4,0.2,0.2))) +test(2183.65, melt(iris.days, measure.vars=measure(pattern="day")), error="pattern must contain at least one capture group (parenthesized sub-pattern)") +test(2183.66, melt(iris.days, measure.vars=measure(value.name, pattern="(.*)")), error="value.name is the only group; fix by creating at least one more group") +test(2183.67, melt(iris.days, measure.vars=measure(foo, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") +test(2183.68, melt(iris.days, measure.vars=measure(value.name, bar, pattern="(foo)(bar)")), error="pattern did not match any cols, so nothing would be melted; fix by changing pattern") +test(2183.69, melt(data.table(ff=1, ff=2), measure.vars=measure(letter, number, pattern="(.)(.)")), error="measured columns should be uniquely named, problems: ff") +test(2183.70, melt(data.table(f_f=1, f_f=2), 
measure.vars=measure(letter, number)), error="measured columns should be uniquely named, problems: f_f") +test(2183.71, melt(iris.days, measure.vars=measure(value.name=as.integer, variable, pattern="day(.)[.](.*)")), error="value.name column class=integer after applying conversion function, but must be character") +test(2183.72, melt(data.table(ff=1, ff=2, a=3, b=4), measure.vars=measure(letter, pattern="([ab])"), id.vars="ff"), data.table(ff=1, letter=c("a","b"), value=c(3,4)))#duplicate column names are fine if they are not matched by pattern. +test(2183.73, melt(DTid, measure.vars=measure(letter, multiple.keyword, pattern="([ab])([12])")), error="group names specified in ... conflict with measure argument names; please fix by changing group names: multiple.keyword") +test(2183.74, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=as.integer, pattern="([ab])([12])")), error="multiple.keyword must be a character string") +test(2183.75, melt(DTid, measure.vars=measure(letter, number, multiple.keyword=NA_character_, pattern="([ab])([12])")), error="multiple.keyword must be a character string") +test(2183.76, melt(DTid, measure.vars=measure(letter, number, multiple.keyword="", pattern="([ab])([12])")), error="multiple.keyword must be a character string with nchar>0") +test(2183.77, melt(DTid, measure.vars=measure(letter, cols, pattern="([ab])([12])")), error="group names specified in ... 
conflict with measure argument names; please fix by changing group names: cols") +test(2183.78, melt(DTid, measure.vars=measure(letter, cols=as.integer, pattern="([ab])([12])")), error="cols must be a character vector of column names") +test(2183.79, melt(DTid, measure.vars=measure(letter, number, pattern=as.integer)), error="pattern must be character string") +test(2183.80, melt(DTid, measure.vars=measure(letter, number, sep=as.integer)), error="sep must be character string") + +# `keyby` allows mixing eval/get with direct columns, #4981 +dt <- data.table(a=c(1,2), b=c(3,4), c=c(1,0)) +dt2 <- dt[,.(suma=sum(a)), keyby=.(b=get("b"),c)] +test(2184.1, dt2[1, suma], 1) +dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=b,c)] +test(2184.2, dt2[1, suma], 2) +dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=get("b"))] +test(2184.3, dt2[1, suma], 2) +dt2 <- dt[2,.(suma=sum(a)), keyby=.(b=get("b"),c)] +test(2184.4, dt2[1, suma], 2) +# #4873 +IDT = as.data.table(iris) +vr = "Species" +IDT[, virginca := get(vr) == "virginica"] +ans = data.table(round = c(3, 3, 3, 2, 2, 4, 2, 4), k = c(6, 7, 8, 5, 7, 7, 6, 8), kar = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("setosa", "versicolor", "virginica"), class = "factor"), N = c(24L, 14L, 4L, 1L, 1L, 1L, 3L, 2L)) +test(2184.5, IDT[(virginca), .N, by = .(round(Sepal.Width), k = round(Sepal.Length), kar = get(vr))] , ans) + +# dcast() segfault or 'STRING_ELT() can only be applied to character not logical' fixed in v1.13.0, #2394 +agg = function(x) if(length(x) > 0) min(x) else NA +DT = data.table(id=c(1,1,2,2), x=c('y','y','y','z'), v=c('a','b','c','d')) +test(2185, dcast(DT, formula=id~x, fun.aggregate=agg, value.var='v'), + data.table(id=c(1,2), y=c('a','c'), z=c(NA,'d'), key="id")) + +# compatible branches might seem incompatible if the condition is global, #4274 +DT = data.table(a=1L) +test(2186, DT[, if (TRUE) .(a=1L) else .(a=1L, b=2L)], DT, + warning='j may not evaluate to the same number of columns for each group') + +# col.names='none' 
should apply when wrapping too, #4270 +DT = setDT(replicate(getOption('width'), 1, simplify = FALSE)) +test(2187, {print(DT, col.names='none'); TRUE}, notOutput="V") + +# fifelse now supports vector na arguments and coerces NA to other types, PR#4289 +test(2188.01, fifelse(c(TRUE, FALSE, TRUE, NA), 1L, 2L, 1.0), c(1, 2, 1, 1)) +test(2188.02, fifelse(c(TRUE, FALSE, TRUE, NA), 1, 2, 1L), c(1, 2, 1, 1)) +test(2188.03, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, 11:14, 101:104), c(1L, 12L, 3L, 104L)) +test(2188.04, fifelse(c(TRUE, FALSE, TRUE, NA), NA, 11:14, 101:104), c(NA, 12L, NA, 104L)) +test(2188.05, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, NA, 101:104), c(1L, NA, 3L, 104L)) +test(2188.06, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, 11:14, NA), c(1L, 12L, 3L, NA)) +test(2188.07, fifelse(c(TRUE, FALSE, TRUE, NA), 1:4, NA, NA), c(1L, NA, 3L, NA)) +test(2188.08, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, NA), c(NA, NA, NA, NA)) +test(2188.09, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, NA_character_), rep(NA_character_, 4L)) +test(2188.10, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, 101:104), c(NA, NA, NA, 104L)) +test(2188.11, fifelse(c(TRUE, FALSE, TRUE, NA), NA, 11:14, NA), c(NA, 12L, NA, NA)) +test(2188.12, fifelse(c(TRUE, FALSE, TRUE, NA), NA, NA, as.Date("2020-01-01")), as.Date(c(NA, NA, NA, "2020-01-01"))) +test(2188.13, fifelse(TRUE, 1L, 2.0, "a"), error="'na' is of type character but 'no' is double. Please") # smart error message +test(2188.14, fifelse(TRUE, NA, 2, as.Date("2019-07-07")), error="'no' has different class than 'na'. 
Please") +test(2188.15, fifelse(TRUE, NA, factor('a'), factor('a', levels = c('a','b'))), error="'no' and 'na' are both type factor but their levels are different") +test(2188.16, fifelse(c(NA, NA), 1L, 2L, NULL), c(NA_integer_, NA_integer_)) # NULL `na` is treated as NA + +# rolling join expected output on non-matching join column has been fixed #1913 +DT = data.table(ID=1:5, A=c(1.3, 1.7, 2.4, 0.9, 0.6)) +buckets = data.table(BucketID=1:4, BinA=1:4) +DT[, A.copy := A] +test(2189.1, buckets[DT, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) +buckets[, BinA := as.numeric(BinA)] +test(2189.2, buckets[DT, on=c("BinA"="A"), roll=-Inf], data.table(BucketID = c(2L, 2L, 3L, 1L, 1L), BinA = c(1.3, 1.7, 2.4, 0.9, 0.6), ID = 1:5, A.copy = c(1.3, 1.7, 2.4, 0.9, 0.6))) + +# segfault subassigning non-list type to list column, #4166 +DT = data.table(a=list(1:2, 3, 4)) +test(2190.1, DT[, a:=1:4], error="Supplied 4 items to be assigned to 3 items of column 'a'.*please use rep") +test(2190.2, DT[1:2, a:=structure(c(1L, 2L), att='t') ]$a, list(structure(1L, att='t'), structure(2L, att='t'), 4)) +test(2190.3, DT[1:2, a:=structure(c(1, 2), att='t') ]$a, list(structure(1, att='t'), structure(2, att='t'), 4)) +test(2190.4, DT[1:2, a:=structure(as.raw(c(1, 2)), att='t') ]$a, list(structure(as.raw(1), att='t'), structure(as.raw(2), att='t'), 4)) +test(2190.5, DT[1:2, a:=structure(as.complex(c(1, 2)), att='t')]$a, list(structure(as.complex(1), att='t'), structure(as.complex(2), att='t'), 4)) +test(2190.61, DT[1:2, a:=structure(c(TRUE, FALSE), att='t') ]$a, list(structure(TRUE, att='t'), structure(FALSE, att='t'), 4)) +test(2190.62, attributes(TRUE), NULL) # ensure R's internal global TRUE/FALSE didn't receive attribute att='t'; discovered when merging #4595 +test(2190.63, attributes(FALSE), NULL) +test(2190.7, DT[1:2, a:=structure(c('a', 'b'), att='t') ]$a, list(structure('a', 
att='t'), structure('b', att='t'), 4)) +if (test_bit64) { + test(2190.8, DT[1:2, a:=as.integer64(1:2) ]$a, list(as.integer64(1), as.integer64(2), 4)) +} +test(2190.9, DT[1:2, a:=call('sum', 1)], error="type 'language' cannot be coerced to 'list'") +test(2190.91, attributes(TRUE), NULL) # ensure R's internal global TRUE/FALSE didn't receive attribute att='t'; discovered when merging #4595 +test(2190.92, attributes(FALSE), NULL) + +# adding test for (since fixed) 'could not find function "."' when verbose=TRUE, #3196 +DT = data.table(i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), i4 = c(79L, 113L, 270L, -121L, 113L, 113L, -121L, 179L, -228L, 113L), v = 0) +test(2191, DT[1:5, sum(v), by=.(i5 = 1:5 %% 2L), verbose=TRUE], data.table(i5=1:0, V1=c(0,0)), output="gforce") + +# base::as.Date was error when first item blank, affecting as.IDate, #4676 +test(2192.1, as.IDate(c('', '2020-01-01')), structure(c(NA_integer_, 18262L), class=c("IDate","Date"))) +test(2192.2, as.IDate(c('2020-01-01', '')), structure(c(18262L, NA_integer_), class=c("IDate","Date"))) + +if (test_bit64) { + # subassign coerce to integer64 was fixed in 1.12.4, #2530 + DT = data.table(a = as.integer64(1:10)) + DT[a==1, a:=12] + DT[a==2, a:=as.integer64(13)] + test(2193.1, DT, data.table(a = as.integer64(c(12,13,3:10)))) + + # X[Y,,by=.EACHI] when Y contains integer64 also fixed in 1.12.4, #3779 + X = data.table(x=1:3) + Y = data.table(x=1:2, y=as.integer64(c(10,20))) + test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) +} + +# compatibility of endsWith backport with base::endsWith +if (exists('endsWith', 'package:base', inherits=FALSE)) { + DTendsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} + BSendsWith = base::endsWith + test(2194.1, DTendsWith('abcd', 'd'), BSendsWith('abcd', 'd')) + test(2194.2, DTendsWith(letters, 'e'), BSendsWith(letters, 'e')) + test(2194.3, DTendsWith(NA_character_, 'a'), 
BSendsWith(NA_character_, 'a')) + test(2194.4, DTendsWith(character(), 'a'), BSendsWith(character(), 'a')) + # file used in encoding tests + txt = readLines(testDir("issue_563_fread.txt")) + test(2194.5, DTendsWith(txt, 'B'), BSendsWith(txt, 'B')) +} + +# uniqueN(x, by=character()) was internal error, #4594 +DT = data.table(idx=c(1L,2L,1L,3L), value="val") +test(2195.1, uniqueN(DT, by=character(0L)), 3L) +test(2195.2, uniqueN(DT, by=NULL), 3L) +test(2195.3, unique(DT, by=character(0L)), ans<-data.table(idx=1:3, value="val")) +test(2195.4, unique(DT, by=NULL), ans) +test(2195.5, duplicated(DT, by=character(0L)), ans<-c(FALSE, FALSE, TRUE, FALSE)) +test(2195.6, duplicated(DT, by=NULL), ans) +test(2195.7, anyDuplicated(DT, by=character(0L)), 3L) +test(2195.8, anyDuplicated(DT, by=NULL), 3L) + + +# Test new feature %notin%, #4152 +test(2196.1, 11 %notin% 1:10, TRUE) +test(2196.2, "a" %notin% c(), TRUE) +test(2196.3, "a" %notin% c("a", "b", "c"), FALSE) +test(2196.4, c(1, 2) %notin% c(1,2,3), c(FALSE, FALSE)) +test(2196.5, "a" %notin% character(), TRUE) +test(2196.6, "a" %notin% integer(), TRUE) +test(2196.7, "a" %notin% NULL, TRUE) +test(2196.8, NA %notin% 1:5, TRUE) +test(2196.9, NA %notin% c(1:5, NA), FALSE) \ No newline at end of file diff --git a/man/address.Rd b/man/address.Rd index 258c0241f2..8363d3c7ba 100644 --- a/man/address.Rd +++ b/man/address.Rd @@ -16,8 +16,15 @@ Sometimes useful in determining whether a value has been copied or not, programm \value{ A character vector length 1. 
} +\seealso{ + \code{\link{copy}} +} \references{ -\url{https://stackoverflow.com/a/10913296/403310} (but implemented in C without using \code{.Internal(inspect())}) + \url{https://stackoverflow.com/a/10913296/403310} (but implemented in C without using \code{.Internal(inspect())}) +} +\examples{ +x=1 +address(x) } \keyword{ data } diff --git a/man/assign.Rd b/man/assign.Rd index 5cfc42b9a9..f622755606 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -63,7 +63,7 @@ For additional resources, please read \href{../doc/datatable-faq.html}{\code{vig When \code{LHS} is a factor column and \code{RHS} is a character vector with items missing from the factor levels, the new level(s) are automatically added (by reference, efficiently), unlike base methods. -Unlike \code{<-} for \code{data.frame}, the (potentially large) LHS is not coerced to match the type of the (often small) RHS. Instead the RHS is coerced to match the type of the LHS, if necessary. Where this involves double precision values being coerced to an integer column, a warning is given (whether or not fractional data is truncated). The motivation for this is efficiency. It is best to get the column types correct up front and stick to them. Changing a column type is possible but deliberately harder: provide a whole column as the RHS. This RHS is then \emph{plonked} into that column slot and we call this \emph{plonk syntax}, or \emph{replace column syntax} if you prefer. By needing to construct a full length vector of a new type, you as the user are more aware of what is happening, and it is clearer to readers of your code that you really do intend to change the column type. +Unlike \code{<-} for \code{data.frame}, the (potentially large) LHS is not coerced to match the type of the (often small) RHS. Instead the RHS is coerced to match the type of the LHS, if necessary. Where this involves double precision values being coerced to an integer column, a warning is given when fractional data is truncated. 
It is best to get the column types correct up front and stick to them. Changing a column type is possible but deliberately harder: provide a whole column as the RHS. This RHS is then \emph{plonked} into that column slot and we call this \emph{plonk syntax}, or \emph{replace column syntax} if you prefer. By needing to construct a full length vector of a new type, you as the user are more aware of what is happening and it is clearer to readers of your code that you really do intend to change the column type; e.g., \code{DT[, colA:=as.integer(colA)]}. A plonk occurs whenever you provide a RHS value to `:=` which is \code{nrow} long. When a column is \emph{plonked}, the original column is not updated by reference because that would entail updating every single element of that column whereas the plonk is just one column pointer update. \code{data.table}s are \emph{not} copied-on-change by \code{:=}, \code{setkey} or any of the other \code{set*} functions. See \code{\link{copy}}. } @@ -72,7 +72,7 @@ Unlike \code{<-} for \code{data.frame}, the (potentially large) LHS is not coerc Since \code{[.data.table} incurs overhead to check the existence and type of arguments (for example), \code{set()} provides direct (but less flexible) assignment by reference with low overhead, appropriate for use inside a \code{for} loop. See examples. \code{:=} is more powerful and flexible than \code{set()} because \code{:=} is intended to be combined with \code{i} and \code{by} in single queries on large datasets. } -\section{Note:}{ +\note{ \code{DT[a > 4, b := c]} is different from \code{DT[a > 4][, b := c]}. The first expression updates (or adds) column \code{b} with the value \code{c} on those rows where \code{a > 4} evaluates to \code{TRUE}. \code{X} is updated \emph{by reference}, therefore no assignment needed. The second expression on the other hand updates a \emph{new} \code{data.table} that's returned by the subset operation. 
Since the subsetted data.table is ephemeral (it is not assigned to a symbol), the result would be lost; unless the result is assigned, for example, as follows: \code{ans <- DT[a > 4][, b := c]}. diff --git a/man/cdt.Rd b/man/cdt.Rd index ea7c3a76eb..8c0846cac9 100644 --- a/man/cdt.Rd +++ b/man/cdt.Rd @@ -2,18 +2,25 @@ \alias{cdatatable} \title{ data.table exported C routines } \description{ - Note that this interface is going to be changed in next release. Some of internally used C routines are now exported. This interface should be considered experimental. List of exported C routines and their signatures are provided below in the usage section. } \usage{ -# SEXP subsetDT(SEXP x, SEXP rows, SEXP cols); -# p_dtCsubsetDT = R_GetCCallable("data.table", "CsubsetDT"); +# SEXP DT_subsetDT(SEXP x, SEXP rows, SEXP cols); +# p_DT_subsetDT = R_GetCCallable("data.table", "DT_subsetDT"); } \details{ - For details how to use those see \emph{Writing R Extensions} manual \emph{Linking to native routines in other packages} section. + Details how to use those can be found in \emph{Writing R Extensions} manual \emph{Linking to native routines in other packages} section. + An example use with \code{Rcpp}: +\preformatted{ + dt = data.table::as.data.table(iris) + Rcpp::cppFunction("SEXP mysub2(SEXP x, SEXP rows, SEXP cols) { return DT_subsetDT(x,rows,cols); }", + include="#include ", + depends="data.table") + mysub2(dt, 1:4, 1:4) +} } \note{ - Be aware C routines are likely to have less input validation than their corresponding R interface. For example one should not expect \code{DT[-5L]} will be equal to \code{.Call(CsubsetDT, DT, -5L, seq_along(DT))} because translation of \code{i=-5L} to \code{seq_len(nrow(DT))[-5L]} might be happening on R level. Moreover checks that \code{i} argument is in range of \code{1:nrow(DT)}, missingness, etc. might be happening on R level too. + Be aware C routines are likely to have less input validation than their corresponding R interface. 
For example one should not expect \code{DT[-5L]} will be equal to \code{.Call(DT_subsetDT, DT, -5L, seq_along(DT))} because translation of \code{i=-5L} to \code{seq_len(nrow(DT))[-5L]} might be happening on R level. Moreover checks that \code{i} argument is in range of \code{1:nrow(DT)}, missingness, etc. might be happening on R level too. } \references{ \url{https://cran.r-project.org/doc/manuals/r-release/R-exts.html} diff --git a/man/copy.Rd b/man/copy.Rd index 819fa2a509..587f216805 100644 --- a/man/copy.Rd +++ b/man/copy.Rd @@ -16,11 +16,15 @@ copy(x) \code{data.table} provides functions that operate on objects \emph{by reference} and minimise full object copies as much as possible. Still, it might be necessary in some situations to work on an object's copy which can be done using \code{DT.copy <- copy(DT)}. It may also be sometimes useful before \code{:=} (or \code{set}) is used to subassign to a column by reference. A \code{copy()} may be required when doing \code{dt_names = names(DT)}. Due to R's \emph{copy-on-modify}, \code{dt_names} still points to the same location in memory as \code{names(DT)}. Therefore modifying \code{DT} \emph{by reference} now, say by adding a new column, \code{dt_names} will also get updated. To avoid this, one has to \emph{explicitly} copy: \code{dt_names <- copy(names(DT))}. - } +} +\note{ + To confirm precisely whether an object is a copy of another, compare their exact memory address with \code{\link{address}}. +} \value{ - Returns a copy of the object. + Returns a copy of the object. 
} -\seealso{ \code{\link{data.table}}, \code{\link{setkey}}, \code{\link{setDT}}, \code{\link{setDF}}, \code{\link{set}} \code{\link{:=}}, \code{\link{setorder}}, \code{\link{setattr}}, \code{\link{setnames}} +\seealso{ + \code{\link{data.table}}, \code{\link{address}}, \code{\link{setkey}}, \code{\link{setDT}}, \code{\link{setDF}}, \code{\link{set}} \code{\link{:=}}, \code{\link{setorder}}, \code{\link{setattr}}, \code{\link{setnames}} } \examples{ # Type 'example(copy)' to run these at prompt and browse output diff --git a/man/data.table.Rd b/man/data.table.Rd index 59b6aae1e1..e934028a3b 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -31,7 +31,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac .SDcols, verbose = getOption("datatable.verbose"), # default: FALSE allow.cartesian = getOption("datatable.allow.cartesian"), # default: FALSE - drop = NULL, on = NULL) + drop = NULL, on = NULL, env = NULL) } \arguments{ \item{\dots}{ Just as \code{\dots} in \code{\link{data.frame}}. Usual recycling rules are applied to vectors of different lengths to create a list of equal length vectors.} @@ -110,7 +110,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \emph{Advanced:} In the \code{X[Y, j]} form of grouping, the \code{j} expression sees variables in \code{X} first, then \code{Y}. We call this \emph{join inherited scope}. If the variable is not in \code{X} or \code{Y} then the calling frame is searched, its calling frame, and so on in the usual way up to and including the global environment.} - \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use `keyby=` routinely when you wish the result to be sorted.} + \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. 
It is common practice to use `keyby=` routinely when you wish the result to be sorted. May also be \code{TRUE} or \code{FALSE} when \code{by} is provided as an alternative way to accomplish the same operation.} \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In case of overlapping variables names inside dataset and in parent scope you can use double dot prefix \code{..cols} to explicitly refer to `\code{cols} variable parent scope and not from your dataset. @@ -170,6 +170,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac } See examples as well as \href{../doc/datatable-secondary-indices-and-auto-indexing.html}{\code{vignette("datatable-secondary-indices-and-auto-indexing")}}. } + + \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. 
} } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr @@ -200,6 +202,7 @@ The way to read this out loud is: "Take \code{DT}, subset rows by \code{i}, \emp X[, sum(a), by=c:f] # get sum(a) grouped by all columns in between 'c' and 'f' (both inclusive) X[, sum(a), keyby=b] # get sum(a) grouped by 'b', and sort that result by the grouping column 'b' + X[, sum(a), by=b, keyby=TRUE] # same order as above, but using sorting flag X[, sum(a), by=b][order(b)] # same order as above, but by chaining compound expressions X[c>1, sum(a), by=c] # get rows where c>1 is TRUE, and on those rows, get sum(a) grouped by 'c' X[Y, .(a, b), on="c"] # get rows where Y$c == X$c, and select columns 'X$a' and 'X$b' for those rows @@ -220,11 +223,11 @@ See the \code{see also} section for the several other \emph{methods} that are av } \references{ -\url{https://github.com/Rdatatable/data.table/wiki} (\code{data.table} homepage)\cr +\url{https://r-datatable.com} (\code{data.table} homepage)\cr \url{https://en.wikipedia.org/wiki/Binary_search} } -\note{ If \code{keep.rownames} or \code{check.names} are supplied they must be written in full because \R does not allow partial argument names after `\code{\dots}`. For example, \code{data.table(DF, keep=TRUE)} will create a -column called \code{"keep"} containing \code{TRUE} and this is correct behaviour; \code{data.table(DF, keep.rownames=TRUE)} was intended. +\note{ If \code{keep.rownames} or \code{check.names} are supplied they must be written in full because \R does not allow partial argument names after \code{\dots}. For example, \code{data.table(DF, keep=TRUE)} will create a +column called \code{keep} containing \code{TRUE} and this is correct behaviour; \code{data.table(DF, keep.rownames=TRUE)} was intended. \code{POSIXlt} is not supported as a column type because it uses 40 bytes to store a single datetime. They are implicitly converted to \code{POSIXct} type with \emph{warning}. 
You may also be interested in \code{\link{IDateTime}} instead; it has methods to convert to and from \code{POSIXlt}. } @@ -280,6 +283,7 @@ DT[["v"]] # same as DT[, v] but much faster # grouping operations - j and by DT[, sum(v), by=x] # ad hoc by, order of groups preserved in result DT[, sum(v), keyby=x] # same, but order the result on by cols +DT[, sum(v), by=x, keyby=TRUE] # same, but using sorting flag DT[, sum(v), by=x][order(x)] # same but by chaining expressions together # fast ad hoc row subsets (subsets as joins) diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd index daf9fba655..2aa265a96c 100644 --- a/man/dcast.data.table.Rd +++ b/man/dcast.data.table.Rd @@ -61,16 +61,16 @@ Historical note: \code{dcast.data.table} was originally designed as an enhanceme \examples{ ChickWeight = as.data.table(ChickWeight) setnames(ChickWeight, tolower(names(ChickWeight))) -DT <- melt(as.data.table(ChickWeight), id=2:4) # calls melt.data.table +DT <- melt(as.data.table(ChickWeight), id.vars=2:4) # calls melt.data.table # dcast is an S3 method in data.table from v1.9.6 -dcast(DT, time ~ variable, fun=mean) # using partial matching of argument -dcast(DT, diet ~ variable, fun=mean) +dcast(DT, time ~ variable, fun.aggregate=mean) +dcast(DT, diet ~ variable, fun.aggregate=mean) dcast(DT, diet+chick ~ time, drop=FALSE) dcast(DT, diet+chick ~ time, drop=FALSE, fill=0) # using subset -dcast(DT, chick ~ time, fun=mean, subset=.(time < 10 & chick < 20)) +dcast(DT, chick ~ time, fun.aggregate=mean, subset=.(time < 10 & chick < 20)) # drop argument, #1512 DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2), @@ -78,37 +78,37 @@ DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2), v3 = factor(c(2L, 3L, 5L, 1L, 2L, 6L), levels=1:6), v4 = c(3L, 2L, 2L, 5L, 4L, 3L)) # drop=TRUE -dcast(DT, v1 + v2 ~ v3) # default is drop=TRUE -dcast(DT, v1 + v2 ~ v3, drop=FALSE) # all missing combinations of both LHS and RHS -dcast(DT, v1 + v2 ~ v3, drop=c(FALSE, TRUE)) # all missing 
combinations of only LHS -dcast(DT, v1 + v2 ~ v3, drop=c(TRUE, FALSE)) # all missing combinations of only RHS +dcast(DT, v1+v2~v3, value.var='v4') # default is drop=TRUE +dcast(DT, v1+v2~v3, value.var='v4', drop=FALSE) # all missing combinations of LHS and RHS +dcast(DT, v1+v2~v3, value.var='v4', drop=c(FALSE, TRUE)) # all missing combinations of LHS only +dcast(DT, v1+v2~v3, value.var='v4', drop=c(TRUE, FALSE)) # all missing combinations of RHS only # using . and ... DT <- data.table(v1 = rep(1:2, each = 6), v2 = rep(rep(1:3, 2), each = 2), v3 = rep(1:2, 6), v4 = rnorm(6)) -dcast(DT, \dots ~ v3, value.var = "v4") #same as v1 + v2 ~ v3, value.var = "v4" -dcast(DT, v1 + v2 + v3 ~ ., value.var = "v4") +dcast(DT, \dots ~ v3, value.var="v4") # same as v1+v2 ~ v3, value.var="v4" +dcast(DT, v1+v2+v3 ~ ., value.var="v4") ## for each combination of (v1, v2), add up all values of v4 -dcast(DT, v1 + v2 ~ ., value.var = "v4", fun.aggregate = sum) +dcast(DT, v1+v2 ~ ., value.var="v4", fun.aggregate=sum) # fill and types -dcast(DT, v2 ~ v3, value.var = 'v1', fill = 0L) # 0L --> 0 -dcast(DT, v2 ~ v3, value.var = 'v4', fill = 1.1) # 1.1 --> 1L +dcast(DT, v2~v3, value.var='v1', fun.aggregate=length, fill=0L) # 0L --> 0 +dcast(DT, v2~v3, value.var='v4', fun.aggregate=length, fill=1.1) # 1.1 --> 1L # multiple value.var and multiple fun.aggregate DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), - z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L) + z=sample(letters[1:2], 20,TRUE), d1=runif(20), d2=1L) # multiple value.var -dcast(DT, x + y ~ z, fun=sum, value.var=c("d1","d2")) +dcast(DT, x+y ~ z, fun.aggregate=sum, value.var=c("d1","d2")) # multiple fun.aggregate -dcast(DT, x + y ~ z, fun=list(sum, mean), value.var="d1") +dcast(DT, x+y ~ z, fun.aggregate=list(sum, mean), value.var="d1") # multiple fun.agg and value.var (all combinations) -dcast(DT, x + y ~ z, fun=list(sum, mean), value.var=c("d1", "d2")) +dcast(DT, x+y ~ z, fun.aggregate=list(sum, mean), 
value.var=c("d1", "d2")) # multiple fun.agg and value.var (one-to-one) -dcast(DT, x + y ~ z, fun=list(sum, mean), value.var=list("d1", "d2")) +dcast(DT, x+y ~ z, fun.aggregate=list(sum, mean), value.var=list("d1", "d2")) } \seealso{ \code{\link{melt.data.table}}, \code{\link{rowid}}, \url{https://cran.r-project.org/package=reshape} diff --git a/man/deprecated.Rd b/man/deprecated.Rd index c1bb9afc16..da138d8734 100644 --- a/man/deprecated.Rd +++ b/man/deprecated.Rd @@ -8,6 +8,9 @@ \usage{ key(x) <- value # warning since 2012; DEPRECATED since Mar 2019 } +\examples{ +# dummy example section to pass release check that all .Rd files have examples +} \arguments{ \item{x}{ Deprecated. } } diff --git a/man/fcase.Rd b/man/fcase.Rd index 82e582ca43..dd3a119110 100644 --- a/man/fcase.Rd +++ b/man/fcase.Rd @@ -5,7 +5,7 @@ \code{fcase} is a fast implementation of SQL \code{CASE WHEN} statement for R. Conceptually, \code{fcase} is a nested version of \code{\link{fifelse}} (with smarter implementation than manual nesting). It is comparable to \code{dplyr::case_when} and supports \code{bit64}'s \code{integer64} and \code{nanotime} classes. } \usage{ - fcase(..., default=NA) + fcase(\dots, default=NA) } \arguments{ \item{...}{ A sequence consisting of logical condition (\code{when})-resulting value (\code{value}) \emph{pairs} in the following order \code{when1, value1, when2, value2, ..., whenN, valueN}. Logical conditions \code{when1, when2, ..., whenN} must all have the same length, type and attributes. Each \code{value} may either share length with \code{when} or be length 1. Please see Examples section for further details.} diff --git a/man/fifelse.Rd b/man/fifelse.Rd index 2fe355c98c..4165dd796d 100644 --- a/man/fifelse.Rd +++ b/man/fifelse.Rd @@ -11,10 +11,10 @@ \arguments{ \item{test}{ A logical vector. } \item{yes, no}{ Values to return depending on \code{TRUE}/\code{FALSE} element of \code{test}. 
They must be the same type and be either length \code{1} or the same length of \code{test}. } - \item{na}{ Value to return if an element of \code{test} is \code{NA}. It must be the same type as \code{yes} and \code{no} and length \code{1}. Default value \code{NA}. \code{NULL} is treated as \code{NA}. } + \item{na}{ Value to return if an element of \code{test} is \code{NA}. It must be the same type as \code{yes} and \code{no} and its length must be either \code{1} or the same length of \code{test}. Default value \code{NA}. \code{NULL} is treated as \code{NA}. } } \details{ -In contrast to \code{\link[base]{ifelse}} attributes are copied from \code{yes} to the output. This is useful when returning \code{Date}, \code{factor} or other classes. +In contrast to \code{\link[base]{ifelse}} attributes are copied from the first non-\code{NA} argument to the output. This is useful when returning \code{Date}, \code{factor} or other classes. } \value{ A vector of the same length as \code{test} and attributes as \code{yes}. Data values are taken from the values of \code{yes} and \code{no}, eventually \code{na}. diff --git a/man/fread.Rd b/man/fread.Rd index 703eb70d3e..c7b7da8566 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -37,7 +37,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{nrows}{ The maximum number of rows to read. Unlike \code{read.table}, you do not need to set this to an estimate of the number of rows in the file for better speed because that is already automatically determined by \code{fread} almost instantly using the large sample of lines. \code{nrows=0} returns the column names and typed empty columns determined by the large sample; useful for a dry run of a large file or to quickly check format consistency of a set of files before starting to read any of them. } \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. 
If so, or TRUE is supplied, any empty column names are given a default name. } \item{na.strings}{ A character vector of strings which are to be interpreted as \code{NA} values. By default, \code{",,"} for columns of all types, including type \code{character} is read as \code{NA} for consistency. \code{,"",} is unambiguous and read as an empty string. To read \code{,NA,} as \code{NA}, set \code{na.strings="NA"}. To read \code{,,} as blank string \code{""}, set \code{na.strings=NULL}. When they occur in the file, the strings in \code{na.strings} should not appear quoted since that is how the string literal \code{,"NA",} is distinguished from \code{,NA,}, for example, when \code{na.strings="NA"}. } - \item{stringsAsFactors}{ Convert all character columns to factors? } + \item{stringsAsFactors}{ Convert all or some character columns to factors? Acceptable inputs are \code{TRUE}, \code{FALSE}, or a decimal value between 0.0 and 1.0. For \code{stringsAsFactors = FALSE}, all string columns are stored as \code{character} vs. all stored as \code{factor} when \code{TRUE}. When \code{stringsAsFactors = p} for \code{0 <= p <= 1}, string columns \code{col} are stored as \code{factor} if \code{uniqueN(col)/nrow < p}. + } \item{verbose}{ Be chatty and report timings? } \item{skip}{ If 0 (default) start on the first line and from there finds the first row with a consistent number of columns. This automatically avoids irregular header information before the column names row. \code{skip>0} means ignore the first \code{skip} rows manually. \code{skip="string"} searches for \code{"string"} in the file (e.g. a substring of the column names row) and starts on that line (inspired by read.xls in package gdata). } \item{select}{ A vector of column names or numbers to keep, drop the rest. \code{select} may specify types too in the same way as \code{colClasses}; i.e., a vector of \code{colname=type} pairs, or a \code{list} of \code{type=col(s)} pairs. 
In all forms of \code{select}, the order that the columns are specified determines the order of the columns in the result. } diff --git a/man/froll.Rd b/man/froll.Rd index 388c47c485..090b397a90 100644 --- a/man/froll.Rd +++ b/man/froll.Rd @@ -12,71 +12,65 @@ \alias{frollapply} \title{Rolling functions} \description{ - Fast rolling functions to calculate aggregates on sliding window. Function name and arguments are experimental. + Fast rolling functions to calculate aggregates on sliding windows. Function name and arguments are experimental. } \usage{ -frollmean(x, n, fill=NA, algo=c("fast", "exact"), align=c("right", - "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) -frollsum(x, n, fill=NA, algo=c("fast","exact"), align=c("right", "left", - "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) +frollmean(x, n, fill=NA, algo=c("fast", "exact"), + align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) +frollsum(x, n, fill=NA, algo=c("fast","exact"), + align=c("right", "left", "center"), na.rm=FALSE, hasNA=NA, adaptive=FALSE) frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) } \arguments{ - \item{x}{ vector, list, data.frame or data.table of numeric or logical columns. } - \item{n}{ integer vector, for adaptive rolling function also list of - integer vectors, rolling window size. } - \item{fill}{ numeric or logical, value to pad by. Defaults to \code{NA}. } - \item{algo}{ character, default \code{"fast"}. When set to \code{"exact"}, - then slower algorithm is used. It suffers less from floating point - rounding error, performs extra pass to adjust rounding error - correction and carefully handles all non-finite values. If available - it will use multiple cores. See details for more information. } - \item{align}{ character, define if rolling window covers preceding rows - (\code{"right"}), following rows (\code{"left"}) or centered - (\code{"center"}). Defaults to \code{"right"}. } - \item{na.rm}{ logical. 
Should missing values be removed when - calculating window? Defaults to \code{FALSE}. For details on handling - other non-finite values, see details below. } - \item{hasNA}{ logical. If it is known that \code{x} contains \code{NA} - then setting to \code{TRUE} will speed up. Defaults to \code{NA}. } - \item{adaptive}{ logical, should adaptive rolling function be - calculated, default \code{FALSE}. See details below. } - \item{FUN}{ the function to be applied in rolling fashion; see Details for restrictions } - \item{\dots}{ extra arguments passed to \code{FUN} in \code{frollapply}. } + \item{x}{ Vector, \code{data.frame} or \code{data.table} of integer, numeric or logical columns over which to calculate the windowed aggregations. May also be a list, in which case the rolling function is applied to each of its elements. } + \item{n}{ Integer vector giving rolling window size(s). This is the \emph{total} number of included values. Adaptive rolling functions also accept a list of integer vectors. } + \item{fill}{ Numeric; value to pad by. Defaults to \code{NA}. } + \item{algo}{ Character, default \code{"fast"}. When set to \code{"exact"}, a slower (but more accurate) algorithm is used. It + suffers less from floating point rounding errors by performing an extra pass, and carefully handles all non-finite values. + It will use multiple cores where available. See Details for more information. } + \item{align}{ Character, specifying the "alignment" of the rolling window, defaulting to \code{"right"}. \code{"right"} covers preceding rows (the window \emph{ends} on the current value); \code{"left"} covers following rows (the window \emph{starts} on the current value); \code{"center"} is halfway in between (the window is \emph{centered} on the current value, biased towards \code{"left"} when \code{n} is even). } + \item{na.rm}{ Logical, default \code{FALSE}. Should missing values be removed when + calculating window? 
For details on handling other non-finite values, see Details. } + \item{hasNA}{ Logical. If it is known that \code{x} contains \code{NA} + then setting this to \code{TRUE} will speed up calculation. Defaults to \code{NA}. } + \item{adaptive}{ Logical, default \code{FALSE}. Should the rolling function be calculated adaptively? See Details below. } + \item{FUN}{ The function to be applied to the rolling window; see Details for restrictions. } + \item{\dots}{ Extra arguments passed to \code{FUN} in \code{frollapply}. } } \details{ - \code{froll*} functions accepts vectors, lists, data.frames or - data.tables. They always return a list except when the input is a - \code{vector} and \code{length(n)==1} in which case a \code{vector} - is returned, for convenience. Thus rolling functions can be used - conveniently within data.table syntax. + \code{froll*} functions accept vectors, lists, \code{data.frame}s or + \code{data.table}s. They always return a list except when the input is a + \code{vector} and \code{length(n)==1}, in which case a \code{vector} + is returned, for convenience. Thus, rolling functions can be used + conveniently within \code{data.table} syntax. Argument \code{n} allows multiple values to apply rolling functions on - multiple window sizes. If \code{adaptive=TRUE}, then it expects a list. + multiple window sizes. If \code{adaptive=TRUE}, then \code{n} must be a list. Each list element must be integer vector of window sizes corresponding - to every single observation in each column. + to every single observation in each column; see Examples. - When \code{algo="fast"} then \emph{on-line} algorithm is used, also - any \code{NaN, +Inf, -Inf} is treated as \code{NA}. - Setting \code{algo="exact"} will make rolling functions to use - compute-intensive algorithm that suffers less from floating point - rounding error. 
It also handles \code{NaN, +Inf, -Inf} consistently to + When \code{algo="fast"} an \emph{"on-line"} algorithm is used, and + all of \code{NaN, +Inf, -Inf} are treated as \code{NA}. + Setting \code{algo="exact"} will make rolling functions to use a more + computationally-intensive algorithm that suffers less from floating point + rounding error (the same consideration applies to \code{\link[base]{mean}}). + \code{algo="exact"} also handles \code{NaN, +Inf, -Inf} consistently to base R. In case of some functions (like \emph{mean}), it will additionally make extra pass to perform floating point error correction. Error corrections might not be truly exact on some platforms (like Windows) when using multiple threads. - Adaptive rolling functions are special cases where for each single - observation has own corresponding rolling window width. Due to the logic - of adaptive rolling functions, following restrictions apply: + Adaptive rolling functions are a special case where each + observation has its own corresponding rolling window width. Due to the logic + of adaptive rolling functions, the following restrictions apply: \itemize{ \item{ \code{align} only \code{"right"}. } \item{ if list of vectors is passed to \code{x}, then all - list vectors must have equal length. } + vectors within it must have equal length. } } When multiple columns or multiple windows width are provided, then they - are run in parallel. Except for the \code{algo="exact"} which runs in + are run in parallel. The exception is for \code{algo="exact"}, which runs in parallel already. \code{frollapply} computes rolling aggregate on arbitrary R functions. @@ -113,7 +107,7 @@ frollapply(x, n, FUN, \dots, fill=NA, align=c("right", "left", "center")) \item{ when \code{adaptive=TRUE}, then \code{n} must be vector of length equal to \code{nrow(x)}, or list of such vectors. } \item{ \code{partial} window feature is not supported, although it can - be accomplished by using \code{adaptive=TRUE}, see examples. 
} + be accomplished by using \code{adaptive=TRUE}, see examples. \code{NA} is always returned for incomplete windows. } } Be aware that rolling functions operates on the physical order of input. diff --git a/man/fwrite.Rd b/man/fwrite.Rd index f784b6bc3b..870acaac75 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -6,7 +6,8 @@ As \code{write.csv} but much faster (e.g. 2 seconds versus 1 minute) and just as } \usage{ fwrite(x, file = "", append = FALSE, quote = "auto", - sep = ",", sep2 = c("","|",""), + sep=getOption("datatable.fwrite.sep", ","), + sep2 = c("","|",""), eol = if (.Platform$OS.type=="windows") "\r\n" else "\n", na = "", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"), @@ -19,7 +20,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", compress = c("auto", "none", "gzip"), yaml = FALSE, bom = FALSE, - verbose = getOption("datatable.verbose", FALSE)) + verbose = getOption("datatable.verbose", FALSE), + encoding = "") } \arguments{ \item{x}{Any \code{list} of same length vectors; e.g. \code{data.frame} and \code{data.table}. If \code{matrix}, it gets internally coerced to \code{data.table} preserving col names but not row names} @@ -59,6 +61,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. } \item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} + \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writing raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. 
} } \details{ \code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://www.h2o.ai/blog/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. diff --git a/man/measure.Rd b/man/measure.Rd new file mode 100644 index 0000000000..73a315e006 --- /dev/null +++ b/man/measure.Rd @@ -0,0 +1,92 @@ +\name{measure} +\alias{measure} +\alias{measurev} +\title{Specify measure.vars via regex or separator} +\description{ + These functions compute an integer vector or list for use as + the \code{measure.vars} argument to \code{melt}. + Each measured variable name is converted into several groups that occupy + different columns in the output melted data. + \code{measure} allows specifying group names/conversions in R code + (each group and conversion specified as an argument) + whereas \code{measurev} allows specifying group names/conversions using + data values + (each group and conversion specified as a list element). + See + \href{../doc/datatable-reshape.html}{\code{vignette("datatable-reshape")}} + for more info. 
+} +\usage{ +measure(\dots, sep, pattern, cols, multiple.keyword="value.name") +measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name", + group.desc="elements of fun.list") +} +\arguments{ + \item{\dots}{One or more (1) symbols (without argument name; symbol + is used for group name) or (2) functions to convert the groups + (with argument name that is used for group name). + Must have same number of arguments as groups that are + specified by either \code{sep} or \code{pattern} arguments.} + \item{fun.list}{Named list which must have the same number of + elements as groups that are specified by either \code{sep} or + \code{pattern} arguments. Each name is used for a group + name, and each value must be either a function + (to convert the group from a character vector to an atomic vector of the + same size) or NULL (no conversion).} + \item{sep}{Separator to split each element of \code{cols} into + groups. Columns that result in the maximum number of groups + are considered measure variables.} + \item{pattern}{Perl-compatible regex with capture groups to match to + \code{cols}. Columns that match the regex are considered measure variables.} + \item{cols}{A character vector of column names.} + \item{multiple.keyword}{A string, if used as a group name, then + measure returns a list and melt returns multiple + value columns (with names defined by the unique values in that + group). Otherwise if the string is not used as a group name, then + measure returns a vector and melt returns a single value column.} + \item{group.desc}{Internal, used in error messages.} +} +\seealso{ + \code{\link{melt}}, + \url{https://github.com/Rdatatable/data.table/wiki/Getting-started} +} +\examples{ +(two.iris = data.table(datasets::iris)[c(1,150)]) +# melt into a single value column. 
+melt(two.iris, measure.vars = measure(part, dim, sep=".")) +# do the same, programmatically with measurev +my.list = list(part=NULL, dim=NULL) +melt(two.iris, measure.vars=measurev(my.list, sep=".")) +# melt into two value columns, one for each part. +melt(two.iris, measure.vars = measure(value.name, dim, sep=".")) +# melt into two value columns, one for each dim. +melt(two.iris, measure.vars = measure(part, value.name, sep=".")) +# melt using sep, converting child number to integer. +(two.families = data.table(sex_child1="M", sex_child2="F", age_child1=10, age_child2=20)) +print(melt(two.families, measure.vars = measure( + value.name, child=as.integer, + sep="_child" +)), class=TRUE) +# same melt using pattern. +print(melt(two.families, measure.vars = measure( + value.name, child=as.integer, + pattern="(.*)_child(.)" +)), class=TRUE) +# same melt with pattern and measurev function list. +print(melt(two.families, measure.vars = measurev( + list(value.name=NULL, child=as.integer), + pattern="(.*)_child(.)" +)), class=TRUE) +# inspired by data(who, package="tidyr") +(who <- data.table(id=1, new_sp_m5564=2, newrel_f65=3)) +# melt to three variable columns, all character. +melt(who, measure.vars = measure(diagnosis, gender, ages, pattern="new_?(.*)_(.)(.*)")) +# melt to five variable columns, two numeric (with custom conversion). +print(melt(who, measure.vars = measure( + diagnosis, gender, ages, + ymin=as.numeric, + ymax=function(y)ifelse(y=="", Inf, as.numeric(y)), + pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))" +)), class=TRUE) +} +\keyword{data} diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index e56a10e4e1..ddca733fe8 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -31,7 +31,7 @@ non-measure columns will be assigned to it. 
If integer, must be positive; see De } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } -\item{variable.name}{name for the measured variable names column. The default name is \code{'variable'}.} +\item{variable.name}{name (default \code{'variable'}) of output column containing information about which input column(s) were melted. If \code{measure.vars} is an integer/character vector, then each entry of this column contains the name of a melted column from \code{data}. If \code{measure.vars} is a list of integer/character vectors, then each entry of this column contains an integer indicating an index/position in each of those vectors. If \code{measure.vars} has attribute \code{variable_table} then it must be a data table with nrow = length of \code{measure.vars} vector(s), each row describing the corresponding measured variables(s), (typically created via \code{measure}) and its columns will be output instead of the \code{variable.name} column.} \item{value.name}{name for the molten data values column(s). The default name is \code{'value'}. Multiple names can be provided here for the case when \code{measure.vars} is a \code{list}, though note well that the names provided in \code{measure.vars} take precedence. } \item{na.rm}{If \code{TRUE}, \code{NA} values will be removed from the molten data.} @@ -64,7 +64,11 @@ effect. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt -into multiple columns in a single function call efficiently. The function +into multiple columns in a single function call efficiently. 
+If a vector in the list contains missing values, or is shorter than the +max length of the list elements, then the output will include runs of +missing values at the specified position, or at the end. +The function \code{\link{patterns}} can be used to provide regular expression patterns. When used along with \code{melt}, if \code{cols} argument is not provided, the patterns will be matched against \code{names(data)}, for convenience. @@ -87,53 +91,68 @@ An unkeyed \code{data.table} containing the molten data. set.seed(45) require(data.table) DT <- data.table( - i_1 = c(1:5, NA), - i_2 = c(NA,6,7,8,9,10), - f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), - f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE), - c_1 = sample(c(letters[1:3], NA), 6, TRUE), - d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), - d_2 = as.Date(6:1, origin="2012-01-01")) + i_1 = c(1:5, NA), + n_1 = c(NA, 6, 7, 8, 9, 10), + f_1 = factor(sample(c(letters[1:3], NA), 6L, TRUE)), + f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE), + c_1 = sample(c(letters[1:3], NA), 6L, TRUE), + c_2 = sample(c(LETTERS[1:2], NA), 6L, TRUE), + d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), + d_2 = as.Date(6:1, origin="2012-01-01") +) # add a couple of list cols -DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c] -DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c] +DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5, 1L)))), by = i_1]$c] +DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5, 1L)))), by = i_1]$c] -# id, measure as character/integer/numeric vectors -melt(DT, id=1:2, measure="f_1") -melt(DT, id=c("i_1", "i_2"), measure=3) # same as above -melt(DT, id=1:2, measure=3L, value.factor=TRUE) # same, but 'value' is factor -melt(DT, id=1:2, measure=3:4, value.factor=TRUE) # 'value' is *ordered* factor +# id.vars, measure.vars as character/integer/numeric vectors +melt(DT, id.vars=1:2, measure.vars="f_1") +melt(DT, id.vars=c("i_1", "n_1"), measure.vars=3) # 
same as above +melt(DT, id.vars=1:2, measure.vars=3L, value.factor=TRUE) # same, but 'value' is factor +melt(DT, id.vars=1:2, measure.vars=3:4, value.factor=TRUE) # 'value' is *ordered* factor # preserves attribute when types are identical, ex: Date -melt(DT, id=3:4, measure=c("d_1", "d_2")) -melt(DT, id=3:4, measure=c("i_1", "d_1")) # attribute not preserved +melt(DT, id.vars=3:4, measure.vars=c("d_1", "d_2")) +melt(DT, id.vars=3:4, measure.vars=c("n_1", "d_1")) # attribute not preserved # on list -melt(DT, id=1, measure=c("l_1", "l_2")) # value is a list -melt(DT, id=1, measure=c("c_1", "l_1")) # c1 coerced to list +melt(DT, id.vars=1, measure.vars=c("l_1", "l_2")) # value is a list +suppressWarnings( + melt(DT, id.vars=1, measure.vars=c("c_1", "l_1")) # c1 coerced to list, with warning +) # on character -melt(DT, id=1, measure=c("c_1", "f_1")) # value is char -melt(DT, id=1, measure=c("c_1", "i_2")) # i2 coerced to char +melt(DT, id.vars=1, measure.vars=c("c_1", "f_1")) # value is char +suppressWarnings( + melt(DT, id.vars=1, measure.vars=c("c_1", "n_1")) # n_1 coerced to char, with warning +) # on na.rm=TRUE. 
NAs are removed efficiently, from within C -melt(DT, id=1, measure=c("c_1", "i_2"), na.rm=TRUE) # remove NA +melt(DT, id.vars=1, measure.vars=c("c_1", "c_2"), na.rm=TRUE) # remove NA # measure.vars can be also a list # melt "f_1,f_2" and "d_1,d_2" simultaneously, retain 'factor' attribute # convenient way using internal function patterns() -melt(DT, id=1:2, measure=patterns("^f_", "^d_"), value.factor=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns("^f_", "^d_"), value.factor=TRUE) # same as above, but provide list of columns directly by column names or indices -melt(DT, id=1:2, measure=list(3:4, c("d_1", "d_2")), value.factor=TRUE) +melt(DT, id.vars=1:2, measure.vars=list(3:4, c("d_1", "d_2")), value.factor=TRUE) # same as above, but provide names directly: -melt(DT, id=1:2, measure=patterns(f="^f_", d="^d_"), value.factor=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns(f="^f_", d="^d_"), value.factor=TRUE) # na.rm=TRUE removes rows with NAs in any 'value' columns -melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column -melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) +melt(DT, id.vars=1:2, measure.vars=patterns("l_", "c_"), na.rm=TRUE) +# measure list with missing/short entries results in output with runs of NA +DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] +melt(DT.missing.cols, measure.vars=list(d=1:2, c="c_1", f=c(NA, "f_2"))) + +# specifying columns to melt via separator. +melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, sep="_")) + +# specifying columns to melt via regex. 
+melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="(.)_(.)")) } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd index b8d014976e..71e469ed72 100644 --- a/man/openmp-utils.Rd +++ b/man/openmp-utils.Rd @@ -5,7 +5,7 @@ \alias{openmp} \title{ Set or get number of threads that data.table should use } \description{ - Set and get number of threads to be used in \code{data.table} functions that are parallelized with OpenMP. The number of threads is initialized when \code{data.table} is first loaded in the R session using optional envioronment variables. Thereafter, the number of threads may be changed by calling \code{setDTthreads}. If you change an environment variable using \code{Sys.setenv} you will need to call \code{setDTthreads} again to reread the environment variables. + Set and get number of threads to be used in \code{data.table} functions that are parallelized with OpenMP. The number of threads is initialized when \code{data.table} is first loaded in the R session using optional environment variables. Thereafter, the number of threads may be changed by calling \code{setDTthreads}. If you change an environment variable using \code{Sys.setenv} you will need to call \code{setDTthreads} again to reread the environment variables. 
} \usage{ setDTthreads(threads = NULL, restore_after_fork = NULL, percent = NULL, throttle = NULL) @@ -51,4 +51,7 @@ \item{\file{types.c} - Internal testing usage} } } +\examples{ + getDTthreads(verbose=TRUE) +} \keyword{ data } diff --git a/man/shouldPrint.Rd b/man/shouldPrint.Rd index 80851f53d8..b3e1bcdc9b 100644 --- a/man/shouldPrint.Rd +++ b/man/shouldPrint.Rd @@ -21,5 +21,7 @@ \url{https://github.com/IRkernel/IRkernel/issues/127}\cr \url{https://github.com/Rdatatable/data.table/issues/933}\cr } - +\examples{ +# dummy example section to pass release check that all .Rd files have examples +} diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 30cfedc5fa..9bfa72fceb 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -10,7 +10,7 @@ \alias{.NGRP} \title{ Special symbols } \description{ - \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. See the vignettes and examples here and in \code{\link{data.table}}. + \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. } \details{ @@ -28,6 +28,8 @@ } \code{.EACHI} is defined as \code{NULL} but its value is not used. Its usage is \code{by=.EACHI} (or \code{keyby=.EACHI}) which invokes grouping-by-each-row-of-i; see \code{\link{data.table}}'s \code{by} argument for more details. + + Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples. 
} \seealso{ \code{\link{data.table}}, \code{\link{:=}}, \code{\link{set}}, \code{\link{datatable-optimize}} } @@ -52,5 +54,9 @@ DT[, c(.(y=max(y)), lapply(.SD, min)), DT[, grp := .GRP, by=x] # add a group counter DT[, grp_pct := .GRP/.NGRP, by=x] # add a group "progress" counter X[, DT[.BY, y, on="x"], by=x] # join within each group + +# .N can be different in i and j +DT[{cat(sprintf('in i, .N is \%d\n', .N)); a < .N/2}, + {cat(sprintf('in j, .N is \%d\n', .N)); mean(a)}] } \keyword{ data } diff --git a/man/substitute2.Rd b/man/substitute2.Rd new file mode 100644 index 0000000000..3b8d536141 --- /dev/null +++ b/man/substitute2.Rd @@ -0,0 +1,77 @@ +\name{substitute2} +\alias{substitute2} +\alias{substitute} +\alias{I} +\title{ Substitute expression } +\description{ + Experimental, more robust, and more user-friendly version of base R \code{\link[base]{substitute}}. +} +\usage{ + substitute2(expr, env) +} +\arguments{ + \item{expr}{ Unevaluated expression in which substitution has to take place. } + \item{env}{ List, or an environment that will be coerced to list, from which variables will be taken to inject into \code{expr}. } +} +\details{ + For convenience, the function will turn any character elements of \code{env} argument into symbols. If a character element is of length 2 or more, it will raise an error. It will also turn any list elements into list calls instead. Behaviour can be changed by wrapping \code{env} into \code{\link[base]{I}} call. In such case any symbols must be explicitly created, for example using \code{as.name} function. Alternatively it is possible to wrap particular elements of \code{env} into \code{\link[base]{I}} call, then only those elements will retain their original class. 
+ + Comparing to base R \code{\link[base]{substitute}}, \code{substitute2} function: +\enumerate{ + \item substitutes calls argument names as well + \item by default converts character elements of \code{env} argument to symbols + \item by default converts list elements of \code{env} argument to list calls + \item does not accept missing \code{env} argument + \item evaluates elements of \code{env} argument +} +} +\note{ + Conversion of \emph{character to symbol} and \emph{list to list call} works recursively for each list element in \code{env} list. If this behaviour is not desired for your use case, we would like to hear about that via our issue tracker. For the present moment there is an option to disable that: \code{options(datatable.enlist=FALSE)}. This option is provided only for debugging and will be removed in future. Please do not write code that depends on it, but use \code{\link[base]{I}} calls instead. +} +\value{ + Quoted expression having variables and call argument names substituted. 
+} +\seealso{ \code{\link[base]{substitute}}, \code{\link[base]{I}}, \code{\link[base]{call}}, \code{\link[base]{name}}, \code{\link[base]{eval}} } +\examples{ +## base R substitute vs substitute2 +substitute(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)) ## works also on names + +substitute(var1, list(var1 = "c1")) +substitute2(var1, list(var1 = I("c1"))) ## enforce character with I + +substitute(var1, list(var1 = as.name("c1"))) +substitute2(var1, list(var1 = "c1")) ## turn character into symbol, for convenience + +## mix symbols and characters using 'I' function, both lines will yield same result +substitute2(list(var1 = var2), list(var1 = "c1", var2 = I("some_character"))) +substitute2(list(var1 = var2), I(list(var1 = as.name("c1"), var2 = "some_character"))) + +## list elements are enlist'ed into list calls +(cl1 = substitute(f(lst), list(lst = list(1L, 2L)))) +(cl2 = substitute2(f(lst), I(list(lst = list(1L, 2L))))) +(cl3 = substitute2(f(lst), list(lst = I(list(1L, 2L))))) +(cl4 = substitute2(f(lst), list(lst = quote(list(1L, 2L))))) +(cl5 = substitute2(f(lst), list(lst = list(1L, 2L)))) +cl1[[2L]] ## base R substitute with list element +cl2[[2L]] ## same +cl3[[2L]] ## same +cl4[[2L]] ## desired +cl5[[2L]] ## automatically + +## character to name and list into list calls works recursively +(cl1 = substitute2(f(lst), list(lst = list(1L, list(2L))))) +(cl2 = substitute2(f(lst), I(list(lst = list(1L, list(2L)))))) ## unless I() used +last(cl1[[2L]]) ## enlisted recursively +last(cl2[[2L]]) ## AsIs + +## using substitute2 from another function +f = function(expr, env) { + eval(substitute( + substitute2(.expr, env), + list(.expr = substitute(expr)) + )) +} +f(list(var1 = var2), list(var1 = "c1", var2 = 5L)) +} +\keyword{ data } diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index e84ae4797d..ba0fe25f9c 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -25,4 +25,9 @@ 
test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", If all tests were successful, \code{TRUE} is returned. Otherwise, see the \code{silent} argument above. \code{silent=TRUE} is intended for use at the start of production scripts; e.g. \code{stopifnot(test.data.table(silent=TRUE))} to check \code{data.table} is passing its own tests before proceeding. } \seealso{ \code{\link{data.table}}, \code{\link{test}} } +\examples{ + \dontrun{ + test.data.table() + } +} \keyword{ data } diff --git a/po/R-data.table.pot b/po/R-data.table.pot index 8e6d641240..ad00f12772 100644 --- a/po/R-data.table.pot +++ b/po/R-data.table.pot @@ -106,6 +106,9 @@ msgstr "" msgid "trying to use integer64 class when 'bit64' package is not installed" msgstr "" +msgid "optimised between not available for this data type, fallback to slow R routine" +msgstr "" + msgid "Not yet implemented NAbounds=TRUE for this non-numeric and non-character type" msgstr "" @@ -130,57 +133,99 @@ msgstr "" msgid "the second element should be the upper bound(s)." msgstr "" -msgid "x." +msgid "forderv(query) took ..." +msgstr "" + +msgid "Generating final logical vector ..." +msgstr "" + +msgid "done in" +msgstr "" + +msgid "%s is type %s which is not supported by data.table join" +msgstr "" + +msgid "Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined." +msgstr "" + +msgid "Matching %s factor levels to %s factor levels." msgstr "" -msgid "is type" +msgid "Coercing factor column %s to type character to match type of %s." msgstr "" -msgid "which is not supported by data.table join" +msgid "Matching character column %s to factor levels in %s." msgstr "" -msgid "i." +msgid "Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns." msgstr "" -msgid "Attempting roll join on factor column when joining x." +msgid "%s has same type (%s) as %s. No coercion needed." msgstr "" -msgid "to i." 
+msgid "Coercing all-NA %s (%s) to type %s to match type of %s." msgstr "" -msgid ". Only integer, double or character columns may be roll joined." +msgid "Incompatible join types: %s (%s) and %s (%s)" msgstr "" -msgid "Incompatible join types: x." +msgid "Coercing %s column %s%s to type integer64 to match type of %s." msgstr "" -msgid "(" +msgid "Incompatible join types: %s is type integer64 but %s is type double and contains fractions" msgstr "" -msgid ") and i." +msgid "Coercing double column %s (which contains no fractions) to type integer to match type of %s" msgstr "" -msgid "). Factor columns must join to factor or character columns." +msgid "Coercing integer column %s to type double to match type of %s which contains fractions." msgstr "" -msgid ")" +msgid "Coercing integer column %s to type double for join to match type of %s." msgstr "" -msgid "Incompatible join types:" +msgid "on= matches existing key, using key" msgstr "" -msgid "is type integer64 but" +msgid "on= matches existing index, using index" msgstr "" -msgid "is type double and contains fractions" +msgid "Calculated ad hoc index in %s" +msgstr "" + +msgid "Non-equi join operators detected ..." msgstr "" msgid "roll is not implemented for non-equi joins yet." msgstr "" +msgid "forder took ..." +msgstr "" + +msgid "Generating group lengths ..." +msgstr "" + +msgid "Generating non-equi group ids ..." +msgstr "" + msgid "Column name '_nqgrp_' is reserved for non-equi joins." msgstr "" +msgid "Recomputing forder with non-equi ids ..." +msgstr "" + +msgid "Found %d non-equi group(s) ..." +msgstr "" + +msgid "Starting bmerge ..." +msgstr "" + +msgid "bmerge done in" +msgstr "" + +msgid "cedta decided '%s' wasn't data.table aware. Here is call stack with [[1L]] applied:" +msgstr "" + msgid "key argument of data.table() must be character" msgstr "" @@ -322,12 +367,27 @@ msgstr "" msgid "Attempting to do natural join but no common columns in provided tables" msgstr "" -msgid "Internal error. 
Cannot by=.EACHI when joining to a secondary key, yet" +msgid "Joining but 'x' has no key, natural join using" +msgstr "" + +msgid "not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ..." +msgstr "" + +msgid "Constructing irows for '!byjoin || nqbyjoin' ..." +msgstr "" + +msgid "Internal error. Cannot by=.EACHI when joining to an index, yet" msgstr "" msgid "Internal error. irows has length in by=.EACHI" msgstr "" +msgid "Reorder irows for 'mult==\"all\" && !allGrp1' ..." +msgstr "" + +msgid "Reordering %d rows after bmerge done in ..." +msgstr "" + msgid "logical error. i is not a data.table, but 'on' argument is provided." msgstr "" @@ -349,6 +409,9 @@ msgstr "" msgid "Internal error: notjoin but byjoin or !integer or nomatch==NA" msgstr "" +msgid "Inverting irows for notjoin done in ..." +msgstr "" + msgid "with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] to assign to column name(s) held in variable myVar. See ?':=' for other examples. As warned in 2014, this is now a warning." msgstr "" @@ -385,9 +448,18 @@ msgstr "" msgid "but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities." msgstr "" +msgid "by index '%s' but that index has 0 length. Ignoring." +msgstr "" + msgid "Internal error: irows isn't integer" msgstr "" +msgid "i clause present and columns used in by detected, only these subset:" +msgstr "" + +msgid "i clause present but columns used in by not detected. Having to subset all columns before evaluating 'by': '" +msgstr "" + msgid "'by' appears to evaluate to column names but isn't c() or key(). Use by=list(...) if you can. Otherwise, by=eval" msgstr "" @@ -409,6 +481,9 @@ msgstr "" msgid "The items in the 'by' or 'keyby' list are length(s) (%s). 
Each must be length %d; the same length as there are rows in x (after subsetting if i is provided)." msgstr "" +msgid "by-expression '%s' is not named, and the auto-generated name '%s' clashed with variable(s) in j. Therefore assigning the entire by-expression as name." +msgstr "" + msgid "Internal error: drop_dot passed" msgstr "" @@ -457,6 +532,15 @@ msgstr "" msgid "This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?data.table." msgstr "" +msgid "Detected that j uses these columns:" +msgstr "" + +msgid "'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a single j=eval(macro) instead. Both will detect the columns used which is important for efficiency.\nOld:" +msgstr "" + +msgid "New:" +msgstr "" + msgid ".SD is locked. Using := in .SD's j is reserved for possible future use; a tortuously flexible way to modify by group. Use := in j directly to modify by group by reference." msgstr "" @@ -472,9 +556,18 @@ msgstr "" msgid "LHS of := isn't column names ('character') or positions ('integer' or 'numeric')" msgstr "" +msgid "No rows match i. No new columns to add so not evaluating RHS of :=\nAssigning to 0 row subset of %d rows" +msgstr "" + msgid "Invalid .internal.selfref detected and fixed by taking a (shallow) copy of the data.table so that := can add this new column by reference. At an earlier point, this data.table has been copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved." msgstr "" +msgid "Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. 
Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option." +msgstr "" + +msgid "Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected." +msgstr "" + msgid "Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but it's length" msgstr "" @@ -517,24 +610,72 @@ msgstr "" msgid "The column '.I' can't be grouped because it conflicts with the special .I variable. Try setnames(DT,'.I','I') first." msgstr "" +msgid "Note: forcing units=\"secs\" on implicit difftime by group; call difftime explicitly to choose custom units" +msgstr "" + msgid "logical error. i is not data.table, but mult='all' and 'by'=.EACHI" msgstr "" msgid "Internal error: by= is missing" msgstr "" +msgid "Finding groups using forderv ..." +msgstr "" + +msgid "Finding group sizes from the positions (can be avoided to save RAM) ..." +msgstr "" + +msgid "Getting back original order ..." +msgstr "" + +msgid "Finding groups using uniqlist on key ..." +msgstr "" + msgid "Internal error: byindex not the index name" msgstr "" +msgid "Finding groups using uniqlist on index '%s' ..." +msgstr "" + msgid "Internal error: byindex not found" msgstr "" +msgid "lapply optimization changed j from '%s' to '%s'" +msgstr "" + +msgid "lapply optimization is on, j unchanged as '%s'" +msgstr "" + +msgid "GForce optimized j to '" +msgstr "" + +msgid "GForce is on, left j unchanged" +msgstr "" + msgid "Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 
'trim' is not yet optimized." msgstr "" +msgid "Old mean optimization changed j from '%s' to '%s'" +msgstr "" + +msgid "Old mean optimization is on, left j unchanged." +msgstr "" + +msgid "All optimizations are turned off" +msgstr "" + +msgid "Optimization is on but left j unchanged (single plain symbol): '%s'" +msgstr "" + msgid "Internal error: length(irows)!=length(o__)" msgstr "" +msgid "Making each group and running j (GForce %s) ..." +msgstr "" + +msgid "setkey() after the := with keyby= ..." +msgstr "" + msgid "The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=." msgstr "" @@ -547,6 +688,9 @@ msgstr "" msgid "and bynames is" msgstr "" +msgid "setkey() afterwards for keyby=.EACHI ..." +msgstr "" + msgid "rownames and rownames.value cannot both be used at the same time" msgstr "" @@ -649,6 +793,9 @@ msgstr "" msgid "Argument 'by' must refer only to atomic-type columns, but the following columns are non-atomic:" msgstr "" +msgid "Processing split.data.table with:" +msgstr "" + msgid "x is not a data.table. Shallow copy is a copy of the vector of column pointers (only), so is only meaningful for data.table" msgstr "" @@ -820,6 +967,21 @@ msgstr "" msgid "Internal error in .isFastSubsettable. Please report to data.table developers" msgstr "" +msgid "Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems." +msgstr "" + +msgid "Optimized subsetting with key '" +msgstr "" + +msgid "Optimized subsetting with index '" +msgstr "" + +msgid "Creating new index '" +msgstr "" + +msgid "Creating index %s done in ..." +msgstr "" + msgid "'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'." 
msgstr "" @@ -850,6 +1012,9 @@ msgstr "" msgid "There is no package %s in provided repository." msgstr "" +msgid "Git revision is not available. Most likely data.table was installed from CRAN or local archive.\nGit revision is available when installing from our repositories 'https://Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data.table'." +msgstr "" + msgid "'fromLast' must be TRUE or FALSE" msgstr "" @@ -949,6 +1114,9 @@ msgstr "" msgid "Please provide a name to each element of 'measure.vars'." msgstr "" +msgid "Duplicate column names found in molten data.table. Setting unique names using 'make.names'" +msgstr "" + msgid "y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying." msgstr "" @@ -1042,6 +1210,12 @@ msgstr "" msgid "POSIXct interval cols have mixed timezones. Overlaps are performed on the internal numerical representation of POSIXct objects (always in UTC epoch time), therefore printed values may give the impression that values don't overlap but their internal representations do Please ensure that POSIXct type interval cols have identical 'tzone' attributes to avoid confusion." msgstr "" +msgid "unique() + setkey() operations done in ..." +msgstr "" + +msgid "binary search(es) done in ..." +msgstr "" + msgid "Not yet implemented" msgstr "" @@ -1171,6 +1345,9 @@ msgstr "" msgid "\". Please double check the input file is a valid csvy." msgstr "" +msgid "Processed %d lines of YAML metadata with the following top-level fields: %s" +msgstr "" + msgid "User-supplied 'header' will override that found in metadata." 
msgstr "" @@ -1231,6 +1408,9 @@ msgstr "" msgid "so the column has been left as type '" msgstr "" +msgid "stringsAsFactors=%s converted %d column(s): %s" +msgstr "" + msgid "key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)" msgstr "" @@ -1249,6 +1429,9 @@ msgstr "" msgid "x being coerced from class: matrix to data.table" msgstr "" +msgid "Appending to existing file so setting bom=FALSE and yaml=FALSE" +msgstr "" + msgid "Input has no columns; doing nothing." msgstr "" @@ -1315,6 +1498,9 @@ msgstr "" msgid "Using integer64 class columns require to have 'bit64' package installed." msgstr "" +msgid "%s: using %s: %s" +msgstr "" + msgid "'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already" msgstr "" @@ -1408,19 +1594,7 @@ msgstr "" msgid "The option 'datatable.nomatch' is being used and is not set to the default NA. This option is still honored for now but will be deprecated in future. Please see NEWS for 1.12.4 for detailed information and motivation. To specify inner join, please specify `nomatch=NULL` explicitly in your calls rather than changing the default using this option." msgstr "" -msgid "The datatable." -msgstr "" - -msgid "version (" -msgstr "" - -msgid ") does not match the package (" -msgstr "" - -msgid "). Please close all R sessions to release the old" -msgstr "" - -msgid "and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. 
This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check." +msgid "The datatable.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. It is just that data.table has added this check." msgstr "" msgid "This is R" @@ -1474,6 +1648,15 @@ msgstr "" msgid "Column classes will be suppressed when col.names is 'none'" msgstr "" +msgid "Key: <%s>" +msgstr "" + +msgid "Null data.%s (0 rows and 0 cols)" +msgstr "" + +msgid "Empty data.%s (%d rows and %d cols)" +msgstr "" + msgid "Internal structure doesn't seem to be a list. Possibly corrupt data.table." msgstr "" @@ -1516,6 +1699,18 @@ msgstr "" msgid "Internal error. 'cols' should be character at this point in setkey; please report." 
msgstr "" +msgid "forder took" +msgstr "" + +msgid "setkey on columns %s using existing index '%s'" +msgstr "" + +msgid "reorder took" +msgstr "" + +msgid "x is already ordered by these columns, no need to call reorder" +msgstr "" + msgid "Internal error: index '" msgstr "" @@ -1576,25 +1771,13 @@ msgstr "" msgid "length(by.x) != length(by.y)" msgstr "" -msgid "When x's column ('" -msgstr "" - -msgid "') is character, the corresponding column in y ('" -msgstr "" - -msgid "') should be factor or character, but found incompatible type '" -msgstr "" - -msgid "') is factor, the corresponding column in y ('" +msgid "When x's column ('%s') is character, the corresponding column in y ('%s') should be factor or character, but found incompatible type '%s'." msgstr "" -msgid "') should be character or factor, but found incompatible type '" +msgid "When x's column ('%s') is factor, the corresponding column in y ('%s') should be character or factor, but found incompatible type '%s'." msgstr "" -msgid "') is integer or numeric, the corresponding column in y ('" -msgstr "" - -msgid "') can not be character or logical types, but found incompatible type '" +msgid "When x's column ('%s') is integer or numeric, the corresponding column in y ('%s') can not be character or logical types, but found incompatible type '%s'." msgstr "" msgid "argument 'all' should be logical of length one" @@ -1645,12 +1828,18 @@ msgstr "" msgid "argument 'fill' ignored, only make sense for type='const'" msgstr "" +msgid "No objects of class data.table exist in %s" +msgstr "" + msgid "order.col='" msgstr "" msgid "' not a column name of info" msgstr "" +msgid "Total:" +msgstr "" + msgid "data.table package is loaded. Unload or start a fresh R session." msgstr "" @@ -1660,25 +1849,31 @@ msgstr "" msgid "Neither %s nor %s exist in %s" msgstr "" +msgid "test.data.table() running:" +msgstr "" + +msgid "**** This R session's language is not English. 
Each test will still check that the correct number of errors and/or\n**** warnings are produced. However, to test the text of each error/warning too, please restart R with LANGUAGE=en" +msgstr "" + msgid "Failed after test" msgstr "" msgid "before the next test() call in" msgstr "" -msgid "out of" +msgid "Timings count mismatch:" msgstr "" -msgid ". Search" +msgid "vs" msgstr "" -msgid "for test number" +msgid "10 longest running tests took" msgstr "" -msgid "Timings count mismatch:" +msgid "All %d tests in %s completed ok in %s" msgstr "" -msgid "vs" +msgid "Running test id %s" msgstr "" msgid "Test" @@ -1687,6 +1882,33 @@ msgstr "" msgid "is invalid: when error= is provided it does not make sense to pass y as well" msgstr "" +msgid "Test id %s is not in increasing order" +msgstr "" + +msgid "Test %s produced %d %ss but expected %d" +msgstr "" + +msgid "Test %s didn't produce the correct %s:\nExpected: %s\nObserved: %s" +msgstr "" + +msgid "Output captured before unexpected warning/error/message:" +msgstr "" + +msgid "Test %s did not produce the correct output:\nExpected: <<%s>>\nObserved <<%s>>" +msgstr "" + +msgid "Test %s produced output but should not have:\nExpected absent (case insensitive): <<%s>>\nObserved: <<%s>>" +msgstr "" + +msgid "Test %s ran without errors but selfrefok(%s) is FALSE" +msgstr "" + +msgid "Test %s ran without errors but failed check that x equals y:" +msgstr "" + +msgid "First %d of %d (type '%s'):" +msgstr "" + msgid "Use started.at=proc.time() not Sys.time() (POSIXt and slow)" msgstr "" @@ -1756,7 +1978,17 @@ msgstr "" msgid "Following columns are not numeric and will be omitted:" msgstr "" +msgid "Index: " +msgid_plural "Indices: " +msgstr[0] "" +msgstr[1] "" + msgid "%d variable not shown: %s\n" msgid_plural "%d variables not shown: %s\n" msgstr[0] "" msgstr[1] "" + +msgid "%d error out of %d. Search %s for test number %s" +msgid_plural "%d errors out of %d. 
Search %s for test numbers %s" +msgstr[0] "" +msgstr[1] "" diff --git a/po/R-zh_CN.po b/po/R-zh_CN.po index a73b8e4a1b..7e78584fd7 100644 --- a/po/R-zh_CN.po +++ b/po/R-zh_CN.po @@ -136,6 +136,11 @@ msgstr "。将采用 UTC 时间进行比较。" msgid "trying to use integer64 class when 'bit64' package is not installed" msgstr "试图使用 intger64 类型但 'bit64' 包尚未安装" +msgid "" +"optimised between not available for this data type, fallback to slow R " +"routine" +msgstr "对这种数据类型的优化尚未实现,使用备用较慢的R方法。" + msgid "" "Not yet implemented NAbounds=TRUE for this non-numeric and non-character type" msgstr "" @@ -165,61 +170,118 @@ msgstr "第一个元素应为下界;" msgid "the second element should be the upper bound(s)." msgstr "第二个元素应为上界。" -msgid "x." -msgstr "x." +msgid "forderv(query) took ..." +msgstr "forderv(query) 用了 ..." + +msgid "Generating final logical vector ..." +msgstr "产生最后的逻辑向量 ..." -msgid "is type" -msgstr "的类型为" +msgid "done in" +msgstr "用了" -msgid "which is not supported by data.table join" -msgstr ",该类型无法用于 data.table 的联接" +msgid "%s is type %s which is not supported by data.table join" +msgstr "%s的类型为%s,该类型无法用于 data.table 的联接" + +msgid "" +"Attempting roll join on factor column when joining %s to %s. Only integer, " +"double or character columns may be roll joined." +msgstr "" +"联接%s与%s时试图滚动联接(roll join)因子类型(factor)的列。但只有整数" +"(integer)、双精度(double)或字符(character)类型的列可以使用滚动联接。" -msgid "i." -msgstr "i." +msgid "Matching %s factor levels to %s factor levels." +msgstr "匹配 %s 的因子水平和 %s 的因子水平。" -msgid "Attempting roll join on factor column when joining x." -msgstr "试图滚动联接(roll join)因子类型(factor)的列,这发生于将 x." +msgid "Coercing factor column %s to type character to match type of %s." +msgstr "将因子类型列 %s 强制转换成字符来匹配目 %s。" -msgid "to i." -msgstr "与 i." +msgid "Matching character column %s to factor levels in %s." +msgstr "匹配字符类型列 %s 和 %s 的因子水平。" -msgid ". Only integer, double or character columns may be roll joined." +msgid "" +"Incompatible join types: %s (%s) and %s (%s). 
Factor columns must join to " +"factor or character columns." msgstr "" -"联接时。但只有整数(integer)、双精度(double)或字符(character)类型的列可" -"以使用滚动联接(roll join)。" +"不兼容的联结类型: %s (%s) 和 %s (%s)。 因子类型的列必须与因子类型或字符类型的" +"列才可以联结" -msgid "Incompatible join types: x." -msgstr "不兼容的联结类型: x。" +msgid "%s has same type (%s) as %s. No coercion needed." +msgstr "%s 有 %s 的类型。不需要强制转换。" -msgid "(" -msgstr "(" +msgid "Coercing all-NA %s (%s) to type %s to match type of %s." +msgstr "强制转换 all-NA %s (%s) 为 %s 类型用来匹配 %s 类型。" -msgid ") and i." -msgstr ")和 i。" +msgid "Incompatible join types: %s (%s) and %s (%s)" +msgstr "不兼容的联结类型: %s (%s) 和 %s (%s)。" -msgid "). Factor columns must join to factor or character columns." -msgstr ")。 因子类型的列必须与因子类型或字符类型的列才可以联结" +msgid "Coercing %s column %s%s to type integer64 to match type of %s." +msgstr "强制转换 %s 个列 %s%s 为整数64类型用来匹配 %s 类型。" -msgid ")" -msgstr ")" +msgid "" +"Incompatible join types: %s is type integer64 but %s is type double and " +"contains fractions" +msgstr "" +"不兼容的联结类型: %s 是 integer64 类型的列但 %s 是有分数的双精度类型列。" -msgid "Incompatible join types:" -msgstr "不兼容的联结类型" +msgid "" +"Coercing double column %s (which contains no fractions) to type integer to " +"match type of %s" +msgstr "强制转换双精度列 %s (不含有分数) 为整数用来匹配 %s 类型" -msgid "is type integer64 but" -msgstr "是 integer64 类型但是" +msgid "" +"Coercing integer column %s to type double to match type of %s which contains " +"fractions." +msgstr "强制转换整数列 %s 为双精度用来匹配含有分数的 %s 类型。" + +msgid "Coercing integer column %s to type double for join to match type of %s." +msgstr "强制转换整数列 %s 为双精度用来与类型 %s 进行联结。" + +msgid "on= matches existing key, using key" +msgstr "on=和现有键(key)相等,用键" + +msgid "on= matches existing index, using index" +msgstr "on=和现有索引(index)相等,用索引" + +msgid "Calculated ad hoc index in %s" +msgstr "计算临时索引用了 %s" -msgid "is type double and contains fractions" -msgstr "是 double 类型并且包含分数" +msgid "Non-equi join operators detected ..." +msgstr "侦测到不等长联结操作符(operator)..." 
msgid "roll is not implemented for non-equi joins yet." msgstr "不等长联结还不能执行 roll " +msgid "forder took ..." +msgstr "forder 用了 ..." + +msgid "Generating group lengths ..." +msgstr "正在生成组的长度。。。" + +msgid "Generating non-equi group ids ..." +msgstr "正在生成不等长的组标识符 . . . " + msgid "Column name '_nqgrp_' is reserved for non-equi joins." msgstr "列名 '_nqgrp_' 是为不等长联结保留的" +msgid "Recomputing forder with non-equi ids ..." +msgstr "用不等长的组标志符重新计算 forder . . . " + +msgid "Found %d non-equi group(s) ..." +msgstr "找到%d不等长分组 ..." + +msgid "Starting bmerge ..." +msgstr "bmerge开始..." + +msgid "bmerge done in" +msgstr "bmerge 用了" + +msgid "" +"cedta decided '%s' wasn't data.table aware. Here is call stack with [[1L]] " +"applied:" +msgstr "cedta决定data.table不识别 '%s'。使用[[1L]]后的呼叫堆叠就是:" + msgid "key argument of data.table() must be character" -msgstr "data.table() 的主参数必须是字符" +msgstr "data.table() 的key参数必须是字符" msgid "Object '" msgstr "对象 '" @@ -427,18 +489,34 @@ msgid "" msgstr "" "但i是一个 data.table (或者是字符向量),必须使用 'on=' 参数指明参与连接的列 " "(参见 ?data.table),可以是keying x(比如,已排序过,和标记已排序过,请参见?" -"setkey),或者是在x和i共用列的名字(比如,自然连接)。如果x有在内存被排序过," -"Keyed连接的速度会在非常大的数据上有较明显的提高。" +"setkey),或者是在x和i共用列的名字(比如,自然连接)。如果x有在内存被排序过,键" +"(keyed)连接的速度会在非常大的数据上有较明显的提高。" msgid "Attempting to do natural join but no common columns in provided tables" msgstr "尝试进行自然连接然而并没有找到表格中相同的列" -msgid "Internal error. Cannot by=.EACHI when joining to a secondary key, yet" -msgstr "内部错误:目前尚无法对次键使用by=.EACH命令" +msgid "Joining but 'x' has no key, natural join using" +msgstr "联结但 'x' 没有键 (key),自然联结用" + +msgid "not-join called with 'by=.EACHI'; Replacing !i with i=setdiff_(x,i) ..." +msgstr "" +"配套使用了 not-join 和 'by=.EACHI' 的命令; 用 !i 取代 i=setdiff_(x,i) ..." + +msgid "Constructing irows for '!byjoin || nqbyjoin' ..." +msgstr "构造 irows 用来对应于 '!byjoin || nqbyjoin' ..." + +msgid "Internal error. Cannot by=.EACHI when joining to an index, yet" +msgstr "内部错误:目前尚无法对索引(index)使用by=.EACH命令" msgid "Internal error. 
irows has length in by=.EACHI" msgstr "内部错误:by=.EACHI 中 irows 有长度" +msgid "Reorder irows for 'mult==\"all\" && !allGrp1' ..." +msgstr "对'mult==\"all\" && !allGrp1'再排序irows ..." + +msgid "Reordering %d rows after bmerge done in ..." +msgstr "bmerge 之后再排序%d行用了..." + msgid "logical error. i is not a data.table, but 'on' argument is provided." msgstr "逻辑错误。当 i 并非一个 data.table时,不应提供'on'参数" @@ -465,6 +543,9 @@ msgstr "" msgid "Internal error: notjoin but byjoin or !integer or nomatch==NA" msgstr "内部错误。原因可能为:notjoin 而非 byjoin;非整数;nomatch 为空" +msgid "Inverting irows for notjoin done in ..." +msgstr "对 notjoin 求逆 irows 用了 ..." + msgid "" "with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. " "Please wrap the LHS of := with parentheses; e.g., DT[,(myVar):=sum(b),by=a] " @@ -519,9 +600,22 @@ msgstr "" "包含逗号),或传入一个长度为1,由逗号分隔的列名组成的向量输入 ?data.table查看" "其他的选项。" +msgid "by index '%s' but that index has 0 length. Ignoring." +msgstr "by 索引(index) '%s' 但那索引的长度为0。将被忽视。" + msgid "Internal error: irows isn't integer" msgstr "内部错误:irows 不是整型" +msgid "i clause present and columns used in by detected, only these subset:" +msgstr "有 i 子句和在 by 用的列被侦测, 子集只有这个:" + +msgid "" +"i clause present but columns used in by not detected. Having to subset all " +"columns before evaluating 'by': '" +msgstr "" +"有 i 子句但是在 by 用的列并没有被侦测。于是所有的列将用于接下里的 'by': 运" +"算。" + msgid "" "'by' appears to evaluate to column names but isn't c() or key(). Use " "by=list(...) if you can. Otherwise, by=eval" @@ -560,6 +654,13 @@ msgstr "" "在'by'或'keyby'列表中的项长度为 %s。每一项的长度须均为%d,即应与 x (或经 i " "筛选后的子集)中所包含行数相同。" +msgid "" +"by-expression '%s' is not named, and the auto-generated name '%s' clashed " +"with variable(s) in j. Therefore assigning the entire by-expression as name." +msgstr "" +"by-expression '%s' 没有命名,自动生成的名字 '%s' 与 j 中的变量名冲突。将用 " +"by-expression 用来命名。" + msgid "Internal error: drop_dot passed" msgstr "内部错误:drop_dot 传入的参数有" @@ -622,6 +723,22 @@ msgid "" "data.table." 
msgstr "此处 j 不使用 .SD 但提供了 .SDcols ,因此忽略 .SDcols详见 ?data.table" +msgid "Detected that j uses these columns:" +msgstr "侦测 j 用这个列:" + +msgid "" +"'(m)get' found in j. ansvars being set to all columns. Use .SDcols or a " +"single j=eval(macro) instead. Both will detect the columns used which is " +"important for efficiency.\n" +"Old:" +msgstr "" +"j 中找到了 '(m)get'。ansvars 将应用到所有的列。请考虑使用 .SDcols 或者一个单" +"独的 j=eval(macro)两个命令都会侦测影响效率的列。\n" +"旧:" + +msgid "New:" +msgstr "新:" + msgid "" ".SD is locked. Using := in .SD's j is reserved for possible future use; a " "tortuously flexible way to modify by group. Use := in j directly to modify " @@ -647,6 +764,13 @@ msgid "" "'numeric')" msgstr ":= 的 LHS 不是列名('字符')或列的位置('整数'或'数值')" +msgid "" +"No rows match i. No new columns to add so not evaluating RHS of :=\n" +"Assigning to 0 row subset of %d rows" +msgstr "" +"没有找到匹配 i 的行。无法增加新的列所以无法运算 RHS of :=\n" +"指定一个 0 行的子集" + msgid "" "Invalid .internal.selfref detected and fixed by taking a (shallow) copy of " "the data.table so that := can add this new column by reference. At an " @@ -664,6 +788,28 @@ msgstr "" "及 ?setattr如果以上讯息无法提供帮助,请回报你的案例至 data.table 问题追踪以助" "于修复根本原因或改进本讯息" +msgid "" +"Growing vector of column pointers from truelength %d to %d. A shallow copy " +"has been taken, see ?setalloccol. Only a potential issue if two variables " +"point to the same data (we can't yet detect that well) and if not you can " +"safely ignore this. To avoid this message you could setalloccol() first, " +"deep copy first using copy(), wrap with suppressWarnings() or increase the " +"'datatable.alloccol' option." +msgstr "" +"列指针向量从 truelength %d 增加为 %d。浅拷贝已经完成,详见 ?setalloccol。如果" +"两个变量指向同一个数据 (这个我们无法侦测),会导致潜在的问题。如果并没有,你" +"可以:忽视这个问题。如果想要避免警告,可以使用以下任一命令,像是 " +"setalloccol(),用 copy() 深度拷贝,套用 suppressWarnings() 或者是增加 " +"'datatable.alloccol' 的选项。" + +msgid "" +"Note that the shallow copy will assign to the environment from which := was " +"called. 
That means for example that if := was called within a function, the " +"original table may be unaffected." +msgstr "" +"需要注意的是这个浅拷贝会被指向给调用了 which := 的环境。意思就是说,如果在函" +"数内部调用了 if :=, 原先的 table 可能不会有任何变化。" + msgid "" "Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] " "syntax is only valid when i is length 1, but it's length" @@ -735,18 +881,52 @@ msgstr "" "无法对 '.I' 列进行分组,因为与 data.table 特有的 .I 变量冲突请先尝试 " "setnames(DT,'.I','I')" +msgid "" +"Note: forcing units=\"secs\" on implicit difftime by group; call difftime " +"explicitly to choose custom units" +msgstr "" +"注意:在隐含的 difftime 强制分组使用了 units=\"secs\"; 请明确的调用 difftime " +"来选择自定义的单位。" + msgid "logical error. i is not data.table, but mult='all' and 'by'=.EACHI" msgstr "逻辑错误: i 不是data.table,但 mult='all' 及 'by'=.EACHI" msgid "Internal error: by= is missing" msgstr "内部错误 : 缺少 by=" +msgid "Finding groups using forderv ..." +msgstr "搜寻组中配套使用了 forderv . . . " + +msgid "Finding group sizes from the positions (can be avoided to save RAM) ..." +msgstr "从位置中搜寻组的大小 (避免此举来节省内存) . . ." + +msgid "Getting back original order ..." +msgstr "恢复原有的顺序 . . . " + +msgid "Finding groups using uniqlist on key ..." +msgstr "搜寻组并配套使用了将 uniqlist 用在键 (key) ... " + msgid "Internal error: byindex not the index name" -msgstr "内部错误 : byindex 不是索引名称" +msgstr "内部错误 : byindex 不是索引(index)名称" + +msgid "Finding groups using uniqlist on index '%s' ..." +msgstr "搜寻组并配套使用了将 uniqlist 用在索引 (index) '%s'... " msgid "Internal error: byindex not found" msgstr "内部错误 : 找不到 byindex" +msgid "lapply optimization changed j from '%s' to '%s'" +msgstr "lapply优化改变j从'%s'成'%s'" + +msgid "lapply optimization is on, j unchanged as '%s'" +msgstr "lapply优化打开了, j ('%s')没有区别" + +msgid "GForce optimized j to '" +msgstr "GForce优化 j 到 '" + +msgid "GForce is on, left j unchanged" +msgstr "GForce打开了, j 没有区别" + msgid "" "Unable to optimize call to mean() and could be very slow. You must name 'na." 
"rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean " @@ -756,9 +936,27 @@ msgstr "" "果您直接使用 mean(x,TRUE)会被认定为 trim=TRUE,trim 是 mean() 中尚未被优化的" "第二顺位参数" +msgid "Old mean optimization changed j from '%s' to '%s'" +msgstr "旧mean优化改变j 从'%s'成'%s'" + +msgid "Old mean optimization is on, left j unchanged." +msgstr "旧mean优化打开了,j没有区别。" + +msgid "All optimizations are turned off" +msgstr "所有优化关掉了" + +msgid "Optimization is on but left j unchanged (single plain symbol): '%s'" +msgstr "优化打开了但是并没有改变 j (一个普通符号):'%s'" + msgid "Internal error: length(irows)!=length(o__)" msgstr "内部错误:length(irows)!=length(o__)" +msgid "Making each group and running j (GForce %s) ..." +msgstr "进行分组中,并且运行 j (GForce %s) ..." + +msgid "setkey() after the := with keyby= ..." +msgstr "keyby=中,:=后setkey() ..." + msgid "" "The setkey() normally performed by keyby= has been skipped (as if by= was " "used) because := is being used together with keyby= but the keyby= contains " @@ -778,6 +976,9 @@ msgstr "但是ans(答案)是" msgid "and bynames is" msgstr "同时bynames是" +msgid "setkey() afterwards for keyby=.EACHI ..." +msgstr "keyby=.EACHI中到底setkey() ..." + msgid "rownames and rownames.value cannot both be used at the same time" msgstr "rownames和rownames.value 不能同时使用" @@ -798,7 +999,7 @@ msgstr "" "行名长度为零,`length(rownames)==0`,但应该为单一列名,单一数值,或NULL" msgid "rownames is TRUE but key has multiple columns" -msgstr "rownames是TRUE但key不只一个列" +msgstr "rownames是TRUE但键(key)不只一个列" msgid "; taking first column x[,1] as rownames" msgstr "; 取第一列, `column x[,1]`, 为rownames" @@ -901,6 +1102,9 @@ msgid "" "columns are non-atomic:" msgstr "参数 'by' 只适用于原子类型的纵列,但现在关联的纵列不是原子类型" +msgid "Processing split.data.table with:" +msgstr "运行 split.data.table 中使用: " + msgid "" "x is not a data.table. Shallow copy is a copy of the vector of column " "pointers (only), so is only meaningful for data.table" @@ -1125,6 +1329,23 @@ msgid "" "Internal error in .isFastSubsettable. 
Please report to data.table developers" msgstr ".isFastSubsettable 产生了内部错误。请向 data.table 开发者报告" +msgid "" +"Subsetting optimization disabled because the cross-product of RHS values " +"exceeds 1e4, causing memory problems." +msgstr "筛选子集优化被停止,因为叉积后的RHS值将超过 1e4,会造成内存问题。" + +msgid "Optimized subsetting with key '" +msgstr "优化的子集用键(key) '" + +msgid "Optimized subsetting with index '" +msgstr "优化的子集用索引(index) '" + +msgid "Creating new index '" +msgstr "造成新索引(index) '" + +msgid "Creating index %s done in ..." +msgstr "造成新索引(index) %s 用了 ..." + msgid "" "'on' argument should be a named atomic vector of column names indicating " "which columns in 'i' should be joined with which columns in 'x'." @@ -1159,6 +1380,17 @@ msgstr "." msgid "There is no package %s in provided repository." msgstr "所提供的资料库中不含包%s" +msgid "" +"Git revision is not available. Most likely data.table was installed from " +"CRAN or local archive.\n" +"Git revision is available when installing from our repositories 'https://" +"Rdatatable.gitlab.io/data.table' and 'https://Rdatatable.github.io/data." +"table'." +msgstr "" +"Git 修订并不存在。可能是因为 data.table 是从 CRAN 或者是本地档案安装。\n" +"Git 修订存在的情况只限于从我们资料库 'https://Rdatatable.gitlab.io/data." +"table' 或者'https://Rdatatable.github.io/data.table'下载。" + msgid "'fromLast' must be TRUE or FALSE" msgstr "'fromLast' 必须为 TRUE 或 FALSE" @@ -1295,6 +1527,13 @@ msgstr "将被优先使用。" msgid "Please provide a name to each element of 'measure.vars'." msgstr "请为 'measure.vars' 中的每个元素提供一个名称。" +msgid "" +"Duplicate column names found in molten data.table. Setting unique names " +"using 'make.names'" +msgstr "" +"重复的列名存在于在 molten 之后 data.table。请使用 'make.names' 设置唯一的列" +"名。" + msgid "" "y and x must both be data.tables. Use `setDT()` to convert list/data.frames " "to data.tables by reference or as.data.table() to convert to data.tables by " @@ -1322,8 +1561,8 @@ msgid "" "'y' must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) " "first, see ?setkey. 
Also check the examples in ?foverlaps." msgstr "" -"'y' 必须有主键(已经排序并且标记为已排序)。请先用 setkey(y, ...) 设置主键," -"可以参考 ?setkey 以及 ?foverlaps 中提供的例子。" +"'y' 必须有键(key:已经排序并且标记为已排序)。请先用 setkey(y, ...) 设置主" +"键,可以参考 ?setkey 以及 ?foverlaps 中提供的例子。" msgid "" "'by.x' and 'by.y' should contain at least two column names (or numbers) each " @@ -1354,7 +1593,7 @@ msgid "The first" msgstr "首先" msgid "columns of y's key must be identical to the columns specified in by.y." -msgstr "在'by.y'中,y键的列必须与指定的列相同" +msgstr "在'by.y'中,y键(key)的列必须与指定的列相同" msgid "Elements listed in 'by.x' must be valid names in data.table 'x'" msgstr "对于data.table中的'X','by.x'中的元素必须是有效名称" @@ -1434,6 +1673,12 @@ msgstr "" "显示却重叠'的印象,(所以)请确保POSIXct类型的间隔列具有相同的'时区'属性以避" "免混乱。" +msgid "unique() + setkey() operations done in ..." +msgstr "unique() + setkey() 执行用了 ..." + +msgid "binary search(es) done in ..." +msgstr "二进制搜索用了 . . . " + msgid "Not yet implemented" msgstr "尚未实现" @@ -1447,7 +1692,7 @@ msgid "length(na.last) > 1, only the first element will be used" msgstr "当na.last长度大于1时,只会使用第一个元素" msgid "x is a single vector, non-NULL 'cols' doesn't make sense" -msgstr "x是单个向量,非空的'cols'没有意义" +msgstr "x是单个向量,非NULL的'cols'没有意义" msgid "x is a list, 'cols' can not be 0-length" msgstr "x是一个list, 'cols'不能为0长度" @@ -1633,6 +1878,10 @@ msgstr "正则 \"" msgid "\". Please double check the input file is a valid csvy." msgstr "从这里开始" +msgid "" +"Processed %d lines of YAML metadata with the following top-level fields: %s" +msgstr "处理了YAML元数据中的排列最前的 %d 行: %s" + msgid "User-supplied 'header' will override that found in metadata." 
msgstr "用户提供的'header'将覆盖元数据中的表头" @@ -1699,11 +1948,14 @@ msgstr ":" msgid "so the column has been left as type '" msgstr "所以该列已经被保存为类型" +msgid "stringsAsFactors=%s converted %d column(s): %s" +msgstr "stringsAsFactors=%s 改变 %d 列: %s" + msgid "" "key argument of data.table() must be a character vector naming columns (NB: " "col.names are applied before this)" msgstr "" -"data.table()的关键参数必须是字符向量命名的列(NB:col.names在这之前被使用过)" +"data.table()的key参数必须是字符向量命名的列(NB:col.names在这之前被使用过)" msgid "" "index argument of data.table() must be a character vector naming columns " @@ -1725,6 +1977,9 @@ msgstr "" msgid "x being coerced from class: matrix to data.table" msgstr "x 的类将强制从 matrix 转变为 data.table" +msgid "Appending to existing file so setting bom=FALSE and yaml=FALSE" +msgstr "并入了已存在的文件,所以设置 bom=FALSE 和 yaml=FALSE" + msgid "Input has no columns; doing nothing." msgstr "输入没有列,不执行任何操作。" @@ -1820,6 +2075,9 @@ msgid "" "Using integer64 class columns require to have 'bit64' package installed." msgstr "要在列中使用 integer64 类,需要先安装 'bit64' 包。" +msgid "%s: using %s: %s" +msgstr "%s: 用 %s: %s" + msgid "" "'xts' class passed to %s function but 'xts' is not available, you should " "have 'xts' installed already" @@ -1895,7 +2153,7 @@ msgid "" "**********" msgstr "" "**********\n" -"用中文运行data.table。软件包只提供英语支持。当在在线搜索帮助时,也要确保检查" +"用中文执行data.table。软件包只提供英语支持。当在在线搜索帮助时,也要确保检查" "英语错误信息。这个可以通过查看软件包源文件中的po/R-zh_CN.po和po/zh_CN.po文件" "获得,这个文件可以并排找到母语和英语错误信息。\n" "**********" @@ -1963,35 +2221,26 @@ msgstr "" "用,但在未来不会被使用。相关的详细信息和动机,请参阅1.12.4的信息。要指定内部" "连接,请在调用中明确指定`nomatch = NULL`,而不要使用此选项更改默认值。" -msgid "The datatable." -msgstr "datatable" - -msgid "version (" -msgstr "版本(" - -msgid ") does not match the package (" -msgstr ")和包不匹配 (" - -msgid "). Please close all R sessions to release the old" -msgstr ").请关闭所有R会话以释放旧版本" - msgid "" -"and reinstall data.table in a fresh R session. 
The root cause is that R's " -"package installer can in some unconfirmed circumstances leave a package in a " -"state that is apparently functional but where new R code is calling old C " -"code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. " -"Once a package is in this mismatch state it may produce wrong results " -"silently until you next upgrade the package. Please help by adding precise " -"circumstances to 17478 to move the status to confirmed. This mismatch " -"between R and C code can happen with any package not just data.table. It is " -"just that data.table has added this check." -msgstr "" -"并在全新的R会话中重新安装data.table。根本原因是R包安装程序可能在某些未经确认" -"的条件下将包置于显然可以正常工作的状态,但是新的R代码正在默默地调用旧的C代" -"码:https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478。一旦安装包处于" -"这不匹配的状态下,在您下次升级程序包之前,它可能会默默地产生错误的结果请提交" -"具体的情况至17478协助我们确认这个Bug。R和C代码之间的这种不匹配可能发生在任何" -"包中,而不仅仅是在data.table中。只是data.table添加了这个检查" +"The datatable.%s version (%s) does not match the package (%s). Please close " +"all R sessions to release the old %s and reinstall data.table in a fresh R " +"session. The root cause is that R's package installer can in some " +"unconfirmed circumstances leave a package in a state that is apparently " +"functional but where new R code is calling old C code silently: https://bugs." +"r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this " +"mismatch state it may produce wrong results silently until you next upgrade " +"the package. Please help by adding precise circumstances to 17478 to move " +"the status to confirmed. This mismatch between R and C code can happen with " +"any package not just data.table. It is just that data.table has added this " +"check." 
+msgstr "" +"data.table.%s版本(%s)和包不匹配版本(%s)。请关闭所有R会话以释放旧%s并在全新的R" +"会话中重新安装data.table。根本原因是R包安装程序可能在某些未经确认的条件下将包" +"置于显然可以正常工作的状态,但是新的R代码正在默默地调用旧的C代码:https://" +"bugs.r-project.org/bugzilla/show_bug.cgi?id=17478。一旦安装包处于这不匹配的状" +"态下,在您下次升级程序包之前,它可能会默默地产生错误的结果请提交具体的情况至" +"17478协助我们确认这个Bug。R和C代码之间的这种不匹配可能发生在任何包中,而不仅" +"仅是在data.table中。只是data.table添加了这个检查" msgid "This is R" msgstr "这是R" @@ -2059,6 +2308,15 @@ msgstr "对col.names有效的参数为'auto', 'top', and 'none'" msgid "Column classes will be suppressed when col.names is 'none'" msgstr "当col.names为'none'时,列的类型将被抑制" +msgid "Key: <%s>" +msgstr "键(key): <%s>" + +msgid "Null data.%s (0 rows and 0 cols)" +msgstr "NULL data.%s (0行,0列)" + +msgid "Empty data.%s (%d rows and %d cols)" +msgstr "空的 data.%s (%d行,%d列)" + msgid "" "Internal structure doesn't seem to be a list. Possibly corrupt data.table." msgstr "内部类型可能不是一个列表,该操作可能会损坏data.table" @@ -2088,8 +2346,8 @@ msgid "" "the original data's order by group. Try setindex() instead. Or, set*(copy(." "SD)) as a (slow) last resort." msgstr "" -"在.SD设置一个物理的键的功能被保留,以备未来的需求; 如需通过分组修改原数据顺序" -"请使用setindex(), 或者set*(copy(.SD))作为最终(该方式缓慢)的方法" +"在.SD设置一个物理的键(key)的功能被保留,以备未来的需求; 如需通过分组修改原数" +"据顺序请使用setindex(), 或者set*(copy(.SD))作为最终(该方式缓慢)的方法" msgid "" "cols is a character vector of zero length. Removed the key, but use NULL " @@ -2099,7 +2357,7 @@ msgstr "" "来避免警告" msgid "cols is the empty string. Use NULL to remove the key." -msgstr "列为一个空字符串,请使用NULL以删除键值。" +msgstr "列为一个空字符串,请使用NULL以删除键(key)值。" msgid "cols contains some blanks." msgstr "列中包含空白" @@ -2115,15 +2373,27 @@ msgid "' is type '" msgstr "是类型" msgid "' which is not supported as a key column type, currently." -msgstr "目前不是一种被支持的列类型" +msgstr "目前不是一种被支持的键(key)列类型" msgid "" "Internal error. 'cols' should be character at this point in setkey; please " "report." 
msgstr "内部错误: 目前在setkey中,'cols'应该是字符类型, 请报告" +msgid "forder took" +msgstr "forder 用了" + +msgid "setkey on columns %s using existing index '%s'" +msgstr "setkey到列%s用现有索引(index) '%s'" + +msgid "reorder took" +msgstr "reorder 用了" + +msgid "x is already ordered by these columns, no need to call reorder" +msgstr "x 已根据这些列进行了排序,无需调用 reorder" + msgid "Internal error: index '" -msgstr "内部错误:索引" +msgstr "内部错误:索引(index) '" msgid "' exists but is invalid" msgstr "存在但无效" @@ -2203,26 +2473,27 @@ msgstr "x 和 y 均需为 data.table" msgid "length(by.x) != length(by.y)" msgstr "length(by.x) != length(by.y)" -msgid "When x's column ('" -msgstr "当 x 的列 ('" - -msgid "') is character, the corresponding column in y ('" -msgstr "') 是字符,y 中相应的列 ('" - -msgid "') should be factor or character, but found incompatible type '" -msgstr "') 应该是因子或字符,然而此类型并不兼容:'" - -msgid "') is factor, the corresponding column in y ('" -msgstr "') 是因子,y 中相应的列 ('" - -msgid "') should be character or factor, but found incompatible type '" -msgstr "') 应该是字符或因子,然而此类型并不兼容:'" +msgid "" +"When x's column ('%s') is character, the corresponding column in y ('%s') " +"should be factor or character, but found incompatible type '%s'." +msgstr "" +"当 x 的列('%s') 是字符,y 中相应的列 ('%s') 应该是因子或字符,然而此类型并不" +"兼容:'%s'." -msgid "') is integer or numeric, the corresponding column in y ('" -msgstr "') 是整数或数值,y 中相应的列 ('" +msgid "" +"When x's column ('%s') is factor, the corresponding column in y ('%s') " +"should be character or factor, but found incompatible type '%s'." +msgstr "" +"当 x 的列('%s') 是因子, y 中相应的列 ('%s') 应该是字符或因子,然而此类型并不" +"兼容:'%s'." -msgid "') can not be character or logical types, but found incompatible type '" -msgstr "') 不能是字符或逻辑类型,然而此类型不兼容:'" +msgid "" +"When x's column ('%s') is integer or numeric, the corresponding column in y " +"('%s') can not be character or logical types, but found incompatible type " +"'%s'." +msgstr "" +"当 x 的列('%s') 是整数或数值,y 中相应的列('%s') 不能是字符或逻辑类型,然而此" +"类型不兼容:'%s'." 
msgid "argument 'all' should be logical of length one" msgstr "参数 'all' 应该是长度为 1 的逻辑型" @@ -2284,12 +2555,18 @@ msgstr "内部错误:此时不匹配的因子类型应已被发现" msgid "argument 'fill' ignored, only make sense for type='const'" msgstr "参数 'fill' 将被忽略,因其仅当 type='const'时有意义" +msgid "No objects of class data.table exist in %s" +msgstr "%s中没有 data.table类型的对象" + msgid "order.col='" msgstr "order.col='" msgid "' not a column name of info" msgstr "' 并非info的一个列名" +msgid "Total:" +msgstr "共计:" + msgid "data.table package is loaded. Unload or start a fresh R session." msgstr "data.table 包已被加载。请将其卸载或启动一个新的 R 会话。" @@ -2303,27 +2580,40 @@ msgstr "" msgid "Neither %s nor %s exist in %s" msgstr "%3$s 中 %1$s 也 %2$s 不存在" +msgid "test.data.table() running:" +msgstr "test.data.table() 执行:" + +msgid "" +"**** This R session's language is not English. Each test will still check " +"that the correct number of errors and/or\n" +"**** warnings are produced. However, to test the text of each error/warning " +"too, please restart R with LANGUAGE=en" +msgstr "" +"**** 此 R 会话的语言并非英文。每个测试仍将检查生成的警告或错误的个数是否正" +"确。**** 然而,若需同时测试警告和错误的文本内容,请用 LANGUAGE=en 重新启动 " +"R。" + msgid "Failed after test" msgstr "错误出现于测试" msgid "before the next test() call in" msgstr "后,先于下一调用test()于" -msgid "out of" -msgstr "总数为" - -msgid ". Search" -msgstr ". 
搜索" - -msgid "for test number" -msgstr "以获得测试编号" - msgid "Timings count mismatch:" msgstr "计时不一致:" msgid "vs" msgstr "vs" +msgid "10 longest running tests took" +msgstr "最慢10个测试用了" + +msgid "All %d tests in %s completed ok in %s" +msgstr "%2$s中每%1$d个测试在%3$s结束了ok" + +msgid "Running test id %s" +msgstr "执行测试 id %s" + msgid "Test" msgstr "测试" @@ -2331,6 +2621,51 @@ msgid "" "is invalid: when error= is provided it does not make sense to pass y as well" msgstr "无效:当使用了error=,不应再输入y" +msgid "Test id %s is not in increasing order" +msgstr "测试标识符 %s 不是递增的顺序" + +msgid "Test %s produced %d %ss but expected %d" +msgstr "测试 %s 生成了%d %ss 但预计生成 %d" + +msgid "" +"Test %s didn't produce the correct %s:\n" +"Expected: %s\n" +"Observed: %s" +msgstr "" +"测试 %s 没有生成正确的 %s:\n" +"预计生成:%s\n" +" 实际生成:%s " + +msgid "Output captured before unexpected warning/error/message:" +msgstr "在意外的警告/错误/提示之前,输入已被记录:" + +msgid "" +"Test %s did not produce the correct output:\n" +"Expected: <<%s>>\n" +"Observed <<%s>>" +msgstr "" +"测试 %s 没有生成正确的输入: \n" +"预计生成: <<%s>>\n" +"实际生成:<<%s>>" + +msgid "" +"Test %s produced output but should not have:\n" +"Expected absent (case insensitive): <<%s>>\n" +"Observed: <<%s>>" +msgstr "" +"测试 %s 生成输出但是不应当出现以下:\n" +"预计不存在(不区分大小写): <<%s>>\n" +"实际生成:<<%s>>" + +msgid "Test %s ran without errors but selfrefok(%s) is FALSE" +msgstr "测试 %s 可以无报错运行但是 selfrefok(%s) 是否:" + +msgid "Test %s ran without errors but failed check that x equals y:" +msgstr "测试 %s 可以无报错运行但是在检查 x 与 y 相同时候有报错:" + +msgid "First %d of %d (type '%s'):" +msgstr "第%d之%d (类型 '%s'):" + msgid "Use started.at=proc.time() not Sys.time() (POSIXt and slow)" msgstr "使用started.at=proc.time()而非Sys.time() (返回POSIXt类型,处理较慢)" @@ -2417,6 +2752,17 @@ msgstr "" msgid "Following columns are not numeric and will be omitted:" msgstr "以下的列并非数值类型,将被忽略:" +msgid "Index: " +msgid_plural "Indices: " +msgstr[0] "索引(index): " + msgid "%d variable not shown: %s\n" msgid_plural "%d variables not shown: %s\n" msgstr[0] "%d变量没显示: %s\n" + 
+msgid "%d error out of %d. Search %s for test number %s" +msgid_plural "%d errors out of %d. Search %s for test numbers %s" +msgstr[0] "%d错误总数为%d. %s中搜索测试编号%s" + +#~ msgid "'target' and 'current' must both be data.tables" +#~ msgstr "'target' 和 'current' 都必须是 data.table" diff --git a/po/zh_CN.po b/po/zh_CN.po index d9b54a4435..57242f7044 100644 --- a/po/zh_CN.po +++ b/po/zh_CN.po @@ -442,12 +442,12 @@ msgid "" "Dropping index '%s' as it doesn't have '__' at the beginning of its name. It " "was very likely created by v1.9.4 of data.table.\n" msgstr "" -"丢掉索引 '%s' 因为它的名字前面没有 '__' 。这个很可能是 data.table v1.9.4 创建" -"的\n" +"丢掉索引(index) '%s' 因为它的名字前面没有 '__' 。这个很可能由data.table " +"v1.9.4 创建\n" #: assign.c:574 msgid "Internal error: index name ends with trailing __" -msgstr "内部错误: 索引名称以 __ 结尾" +msgstr "内部错误: 索引(index)名称以 __ 结尾" #: assign.c:579 msgid "Internal error: Couldn't allocate memory for s4." @@ -460,12 +460,12 @@ msgstr "内部错误: 不能给 s5 分配内存" #: assign.c:611 assign.c:627 #, c-format msgid "Dropping index '%s' due to an update on a key column\n" -msgstr " 因为一个主列的更新,丢掉索引 '%s'\n" +msgstr " 因为一个键(key)列的更新,丢掉索引(index) '%s'\n" #: assign.c:620 #, c-format msgid "Shortening index '%s' to '%s' due to an update on a key column\n" -msgstr "因为一个主列的更新,缩短索引 '%s' 到 '%s'\n" +msgstr "因为一个键(key)列的更新,缩短索引(index) '%s' 到 '%s'\n" #: assign.c:650 #, c-format diff --git a/src/assign.c b/src/assign.c index 7a326baccc..1602e074b9 100644 --- a/src/assign.c +++ b/src/assign.c @@ -149,45 +149,43 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) // NEW: cols argument to specify the columns to shallow copy on. If NULL, all columns. // called from alloccol where n is checked carefully, or from shallow() at R level // where n is set to truelength (i.e. a shallow copy only with no size change) - R_len_t i,l; int protecti=0; SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here? 
SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt))); SET_OBJECT(newdt, OBJECT(dt)); IS_S4_OBJECT(dt) ? SET_S4_OBJECT(newdt) : UNSET_S4_OBJECT(newdt); // To support S4 objects that incude data.table //SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0 - + // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It // also increases truelength. Perhaps make that distinction, then, and split out, but marked // so that the next change knows to duplicate. // keepattr() also merely points to the entire attrbutes list and thus doesn't allow replacing // some of its elements. - + // We copy all attributes that refer to column names so that calling setnames on either // the original or the shallow copy doesn't break anything. SEXP index = PROTECT(getAttrib(dt, sym_index)); protecti++; setAttrib(newdt, sym_index, shallow_duplicate(index)); - + SEXP sorted = PROTECT(getAttrib(dt, sym_sorted)); protecti++; setAttrib(newdt, sym_sorted, duplicate(sorted)); - + SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++; + const int l = isNull(cols) ? LENGTH(dt) : length(cols); if (isNull(cols)) { - l = LENGTH(dt); - for (i=0; i0 but nrow) error(_("i[%d] is %d which is out of range [1,nrow=%d]."),i+1,rowsd[i],nrow); // set() reaches here (test 2005.2); := reaches the same error in subset.c first if (rowsd[i]>=1) numToDo++; @@ -364,13 +362,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) PROTECT(tmp = chmatch(cols, names, 0)); protecti++; buf = (int *) R_alloc(length(cols), sizeof(int)); int k=0; - for (i=0; i0) { if (!isDataTable) error(_("set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that. 
data.table's are over-allocated and don't shallow copy.")); newcolnames = PROTECT(allocVector(STRSXP, k)); protecti++; - for (i=0; ioldncol+length(newcolnames)) { if (!isDataTable) error(_("Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that."), i+1, coln, oldncol); @@ -436,16 +434,20 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } // RHS of assignment to new column is zero length but we'll use its type to create all-NA column of that type } - if (isMatrix(thisvalue) && (j=INTEGER(getAttrib(thisvalue, R_DimSymbol))[1]) > 1) // matrix passes above (considered atomic vector) - warning(_("%d column matrix RHS of := will be treated as one vector"), j); + { + int j; + if (isMatrix(thisvalue) && (j=INTEGER(getAttrib(thisvalue, R_DimSymbol))[1]) > 1) // matrix passes above (considered atomic vector) + warning(_("%d column matrix RHS of := will be treated as one vector"), j); + } const SEXP existing = (coln+1)<=oldncol ? VECTOR_ELT(dt,coln) : R_NilValue; if (isFactor(existing) && !isString(thisvalue) && TYPEOF(thisvalue)!=INTSXP && TYPEOF(thisvalue)!=LGLSXP && !isReal(thisvalue) && !isNewList(thisvalue)) { // !=INTSXP includes factor error(_("Can't assign to column '%s' (type 'factor') a value of type '%s' (not character, factor, integer or numeric)"), CHAR(STRING_ELT(names,coln)),type2char(TYPEOF(thisvalue))); } - if (nrow>0 && targetlen>0 && vlen>1 && vlen!=targetlen && (TYPEOF(existing)!=VECSXP || TYPEOF(thisvalue)==VECSXP)) { - // note that isNewList(R_NilValue) is true so it needs to be TYPEOF(existing)!=VECSXP above + if (nrow>0 && targetlen>0 && vlen>1 && vlen!=targetlen && !(TYPEOF(existing)==VECSXP && targetlen==1)) { + // We allow assigning objects of arbitrary to single items of list columns for convenience. 
+ // Note that isNewList(R_NilValue) is true so it needs to be !(TYPEOF(existing)==VECSXP) above error(_("Supplied %d items to be assigned to %d items of column '%s'. If you wish to 'recycle' the RHS please use rep() to make this intent clear to readers of your code."), vlen, targetlen, CHAR(colnam)); } } @@ -470,11 +472,19 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) error(_("Internal error: selfrefnames is ok but tl names [%d] != tl [%d]"), TRUELENGTH(names), oldtncol); // # nocov SETLENGTH(dt, oldncol+LENGTH(newcolnames)); SETLENGTH(names, oldncol+LENGTH(newcolnames)); - for (i=0; i=0 ? sourceLen : length(source); + int slen = sourceLen>=0 ? sourceLen : length(source); // since source may get reassigned to a scalar, we should not mark it as const if (slen==0) return NULL; if (sourceStart<0 || sourceStart+slen>length(source)) error(_("Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d"), sourceStart, sourceLen, length(source)); // # nocov @@ -710,7 +720,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } else if (!sourceIsFactor && !isString(source)) { // target is factor if (allNA(source, false)) { // return false for list and other types that allNA does not support - source = ScalarLogical(NA_LOGICAL); // a global constant in R and won't allocate; fall through to regular zero-copy coerce + source = ScalarLogical(NA_LOGICAL); slen = 1; // a global constant in R and won't allocate; fall through to regular zero-copy coerce } else if (isInteger(source) || isReal(source)) { // allow assigning level numbers to factor columns; test 425, 426, 429 and 1945 const int nlevel = length(getAttrib(target, R_LevelsSymbol)); @@ -1056,11 +1066,30 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con BODY(SEXP, STRING_PTR, SEXP, val, SET_STRING_ELT(target, off+i, cval)) } case VECSXP : - case EXPRSXP : // #546 - if (TYPEOF(source)!=VECSXP && 
TYPEOF(source)!=EXPRSXP) - BODY(SEXP, &, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) - else - BODY(SEXP, SEXPPTR_RO, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) + case EXPRSXP : { // #546 #4350 + if (len == 1 && TYPEOF(source)!=VECSXP && TYPEOF(source)!=EXPRSXP) { + BODY(SEXP, &, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) + } else { + switch (TYPEOF(source)) { + // allocVector instead of ScalarLogical to avoid copyMostAttrib on R's internal global TRUE/FALSE values; #4595. Then because + // ScalarInteger may now or in future R also return R internal global small integer constants, the same for that. Then + // because we do that here for logical and integer, use allocVeector too for the other types to follow the same pattern and possibly + // in future R will also have some global constants for those types too. + // the UNPROTECT can be at the end of the CAST before the SET_VECTOR_ELT, because SET_VECTOR_ELT will protect it and there's no other code inbetween + // the PROTECT is now needed because of the call to LOGICAL() which could feasibly gc inside it. + // copyMostAttrib is inside CAST so as to be outside loop. 
See the history in #4350 and its follow up + case RAWSXP: BODY(Rbyte, RAW, SEXP, PROTECT(allocVector(RAWSXP, 1));RAW(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case LGLSXP: BODY(int, LOGICAL, SEXP, PROTECT(allocVector(LGLSXP, 1));LOGICAL(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case INTSXP: BODY(int, INTEGER, SEXP, PROTECT(allocVector(INTSXP, 1));INTEGER(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case REALSXP: BODY(double, REAL, SEXP, PROTECT(allocVector(REALSXP, 1));REAL(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case CPLXSXP: BODY(Rcomplex, COMPLEX, SEXP, PROTECT(allocVector(CPLXSXP, 1));COMPLEX(cval)[0]=val;copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case STRSXP: BODY(SEXP, STRING_PTR, SEXP, PROTECT(allocVector(STRSXP, 1));SET_STRING_ELT(cval, 0, val);copyMostAttrib(source,cval);UNPROTECT(1), SET_VECTOR_ELT(target,off+i,cval)) + case VECSXP: + case EXPRSXP: BODY(SEXP, SEXPPTR_RO, SEXP, val, SET_VECTOR_ELT(target,off+i,cval)) + default: COERCE_ERROR("list"); + } + } + } break; default : error(_("Unsupported column type in assign.c:memrecycle '%s'"), type2char(TYPEOF(target))); // # nocov } diff --git a/src/chmatch.c b/src/chmatch.c index 75e45924de..d7fb90a573 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -80,11 +80,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch } int nuniq=0; for (int i=0; i0) { savetl(s); tl=0; } if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table } + // in future if we need NAs in x not to be matched to NAs in table ... + // if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0) + // SET_TRUELENGTH(NA_STRING, 0); if (chmatchdup) { // chmatchdup() is basically base::pmatch() but without the partial matching part. 
For example : // chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table' @@ -113,7 +116,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch for (int i=0; i