From f6b91319df2a45728a476d204d809a7b4aec059c Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 22 Jan 2019 19:04:00 -0800 Subject: [PATCH 1/9] Compiles fine without USE_RINTERNALS. First runtime error fixed. --- src/data.table.h | 1 - src/subset.c | 11 ++++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 498c6d6f88..20c496fa27 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,5 +1,4 @@ #include -#define USE_RINTERNALS #include // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); diff --git a/src/subset.c b/src/subset.c index fc91dbba8f..81d1d78794 100644 --- a/src/subset.c +++ b/src/subset.c @@ -62,11 +62,16 @@ static void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA) } } break; case VECSXP : { - SEXP *sp = VECTOR_PTR(source); + // VECTOR_PTR does exist but returns 'not safe to return vector pointer' when USE_RINTERNALS is not defined. + // VECTOR_DATA and LIST_POINTER exist too but call VECTOR_PTR. All are clearly not intended to be used by packages. + // The concern is overhead inside VECTOR_ELT() biting when called repetitively in a loop like we do here. That's why + // we take the R API (INTEGER()[i], REAL()[i], etc) outside loops for the simple types even when not parallel. For this + // type list case (VECSXP) it might be that some items are ALTREP for example, so we really should use the heavier + // _ELT accessor (VECTOR_ELT) inside the loop in this case. if (anyNA) { - for (int i=0; i Date: Tue, 22 Jan 2019 19:33:46 -0800 Subject: [PATCH 2/9] REAL/INTEGER => DATAPTR for now --- src/dogroups.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 02033af74e..0f7b86ae4b 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -23,7 +23,7 @@ void setSizes() { SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verbose) { - R_len_t i, j, k, rownum, ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, r, thisansloc, grpn, thislen, igrp, vlen, origIlen=0, origSDnrow=0; + R_len_t i, j, k, rownum, ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp, vlen, origIlen=0, origSDnrow=0; int protecti=0; SEXP names, names2, xknames, bynames, dtnames, ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, listwrap, target, source, tmp; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE, recycleWarn=TRUE; @@ -390,11 +390,16 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX for (j=0; j Date: Wed, 23 Jan 2019 19:43:45 -0800 Subject: [PATCH 3/9] modernized memrecycle. recycling length>1 is now error for rubustness. --- NEWS.md | 2 + inst/tests/tests.Rraw | 77 ++++++++++----------- src/assign.c | 151 +++++++++++++++++++++++------------------- src/dogroups.c | 78 ++++++++++++---------- src/reorder.c | 14 ++-- 5 files changed, 175 insertions(+), 147 deletions(-) diff --git a/NEWS.md b/NEWS.md index fd01eca538..b5e2c25af8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ #### NEW FEATURES +1. `:=` no longer recycles too-short RHS vectors to match the length on the LHS. Like base R, there was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length. All cases are now an error unless `length(RHS)==1` or `length(RHS)==length(LHS)`. Consistent feedback was that recycling is rarely useful and more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. + #### BUG FIXES #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8bfa6c8673..b881644d95 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1373,7 +1373,9 @@ x = list(2,"b",2.718) test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14),baz=list(2,"b",2.718))) # Test recycling list DT = data.table(a=1:4,b=5:8) -test(471, DT[,foo:=list("a",2:3)], data.table(a=1:4,b=5:8,foo=list("a",2:3,"a",2:3))) +test(471.1, DT[,foo:=list("a",2:3)], error="Supplied 2 items to be assigned to 4 items of column 'foo'.*recycle") +test(471.2, names(DT), c("a","b")) # mismatch length error was caught ok before adding column not after column added +test(471.3, DT[,foo:=.(list("a"))], data.table(a=1:4,b=5:8,foo=list("a","a","a","a"))) # Test recycling singleton list if (ncol(DT)==3L) DT[,foo:=NULL] # else don't warn here under torture with skip= such that test 471 didn't run test(472, DT[,foo:=list(list(2:3))], data.table(a=1:4,b=5:8,foo=list(2:3,2:3,2:3,2:3))) @@ -1381,9 +1383,9 @@ test(472, DT[,foo:=list(list(2:3))], data.table(a=1:4,b=5:8,foo=list(2:3,2:3,2:3 # Test adding new column with a recycled factor, #1691 DT = data.table(a=1:4,b=5:8) DT[,c:=factor("a")] -test(473, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")))) -DT[,d:=factor(c("a","b"))] -test(474, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")),d=factor(c("a","b","a","b")))) +test(473.1, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")))) +test(473.2, DT[,d:=factor(c("a","b"))], error="Supplied 2 items to be assigned to 4 items of column 'd'") +test(474, DT[,d:=factor(c("X"))], data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")),d=factor(c("X","X","X","X")))) # Test scoping error introduced at 1.6.1, unique(DT) when key column is 'x' DT=data.table(x=c("a", "a", "b", "b"), y=c("a", "a", "b", "b"), key="x") @@ -1410,9 +1412,9 @@ gc() test(477, DT, data.table(a=1:3,foo=c(4L,10L,6L),foo2=c(4L,5L,11L))) test(478, DT[,foo:=foo], DT) # does nothing, with no warning, consistent with base R `a<-a`. -# Test that recycling now works with oversized inputs and % != 0 length, both with warnings. +# Old tests that recycling now works now moved to test error DT = data.table(x=1:4) -test(479, DT[, a:=5:7], data.table(x=1:4,a=c(5:7,5L)), warning="Supplied 3 items to be assigned to 4 items of column 'a' (recycled leaving remainder of 1 items)") +test(479, DT[, a:=5:7], error="Supplied 3 items to be assigned to 4 items of column 'a'") # Test that multiple columns can be added DT = data.table(x=1:4) @@ -2070,8 +2072,8 @@ test(745, DT[,ncol(DT):=1L], data.table(c1=1:3,c2=99L,c3=1L)) DT = data.table(a=letters[c(1:3,3L)],key="a") test(746, DT["a",c("new1","new2"):=list(4L, 5L)], data.table(a=letters[c(1:3,3L)],new1=INT(4,NA,NA,NA),new2=INT(5,NA,NA,NA),key="a")) -test(747, DT[,new1:=4:6], data.table(a=letters[c(1:3,3L)],new1=INT(4L,5L,6L,4L),new2=INT(5,NA,NA,NA),key="a"), warning="recycled leaving remainder of 1 item") -suppressWarnings(DT[,new1:=4:6]) +test(747.1, DT[,new1:=4:6], error="Supplied 3 items to be assigned to 4 items of column 'new1'") +test(747.2, DT[,new1:=INT(4,5,6,4)], data.table(a=letters[c(1:3,3L)],new1=INT(4L,5L,6L,4L),new2=INT(5,NA,NA,NA),key="a")) test(748, DT[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) # and multiple LHS by group, #1710 @@ -2142,12 +2144,10 @@ test(770, DT[J(2:3),.BY[[1]]==b,by=.EACHI], data.table(a=INT(2,2,3,3),V1=c(TRUE, # A data.table RHS of := caused a crash, #2311. a = data.table(first=1:6, third=c(1,1,1,3,3,4), key="first") b = data.table(first=c(3,4,4,5,6,7,8), second=1:7, key="first") -test(771, b[,third:=a[b,third,by=.EACHI]], b, warning="Supplied 2 items.*to 7.*recycled leaving remainder of 1 item") -test(772, copy(b)[,third:=as.list(a[b,third,by=.EACHI])], b, warning="Supplied 2 items.*to 7.*recycled leaving remainder of 1 item") -test(773, b[4,third[[1]]], c(1,3,3,3,4,NA,NA)) -test(774.1, b[,third:=a[b,third,mult="first"]], ans<-data.table(first=c(3,4,4,5,6,7,8), second=1:7, third=c(1,3,3,3,4,NA,NA), key="first")) -test(774.2, b[,third:=a[b,third]], ans) # mult="first" no longer needed as from v1.9.3. It now does what was naturally expected. - +test(771, b[,third:=a[b,third,by=.EACHI]], error="Supplied 2 items to be assigned to 7 items of column 'third'") +test(772, b[,third:=as.list(a[b,third,by=.EACHI])], error="Supplied 2 items to be assigned to 7 items of column 'third'") +test(773, b[,third:=a[b,third,mult="first"]], ans<-data.table(first=c(3,4,4,5,6,7,8), second=1:7, third=c(1,3,3,3,4,NA,NA), key="first")) +test(774, b[,third:=a[b,third]], ans) # mult="first" no longer needed as from v1.9.3. It now does what was naturally expected. # That names are dropped. (Names on the column vectors don't display. They increase size and aren't much use.) DT = data.table(a=1:3,b=LETTERS[1:3]) @@ -2302,10 +2302,9 @@ DT = data.table(a=1:3) DT[,a:=scale(a)] # 1 column matrix auto treated as vector test(835, na.omit(DT), DT) test(836, DT[,a:=as.integer(a)], data.table(a=INT(-1,0,1))) -test(837, DT[,a:=cbind(1,2)], data.table(a=c(1L,2L,1L)), - warning=c("2 column matrix RHS of := will be treated as one vector", - "Supplied 2 items to be assigned to 3 items.*recycled", - "Coerced double RHS to integer to match.*column 1 named 'a'.*values contain no fractions")) +test(837, DT[,a:=cbind(1,2)], + warning = "2 column matrix RHS of := will be treated as one vector", + error = "Supplied 2 items to be assigned to 3 items of column 'a'") DT = data.table(a=1:3,b=1:6) test(838, DT[,c:=scale(b), by=a][,c:=as.integer(1000*c)], data.table(a=1:3,b=1:6,c=rep(as.integer(1000*scale(1:2)), each=3))) @@ -2570,7 +2569,9 @@ test(915, fread(txt,sep="\n"), data.table("A;B;C|D,E"=c("1;3;4|5,6","2;4;6|8,10" # Crash bug when RHS is 0 length and := by group, fixed in 1.8.7 DT = data.table(a=1:3,b=1:6) -test(916, DT[,newcol:=logical(0),by=a], data.table(a=1:3,b=1:6,newcol=NA)) +test(916.1, DT[,newcol:=logical(0),by=a], error="Supplied 0 items to be assigned to group 1 of size 2 in column 'newcol'") +test(916.2, DT, data.table(a=1:3,b=1:6)) # that newcol was not added since group 1 failed +test(916.3, DT[,newcol:=NA,by=a], data.table(a=1:3,b=1:6,newcol=NA)) # roll join error when non last join column is factor, #2450 X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts') @@ -2721,10 +2722,7 @@ setkey(dt2, "Date") test(980, dt1[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3))) DT = data.table(A=1:2,B=3:4,C=5:6) -test(981, DT[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], - data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L)), - warning=c("RHS 2 is length 2.*group 1.*last 1 element.*discarded", - "RHS 2 is length 2.*group 2.*last 1 element.*discarded")) +test(981, DT[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], error="Supplied 2 items to be assigned to group 1 of size 1 in column 'B'") DT = data.table(A=1:2,B=3:4,C=5:6) test(982, DT[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # Also note that G is not yet iterative. In future: c(12,14) @@ -3540,12 +3538,18 @@ test(1131, My_Fun(), DT) # Test for #4957 - where `j` doesn't know `.N` when used with `lapply(.SD, function(x) ...)` test(1132, DT[, lapply(.SD, function(x) .N), by=ID], data.table(ID=c("A", "B", "C"), Value1=2L, Value2=2L)) -# Test for #4990 - `:=` does not generate recycling warning during 'by': -DT <- data.table(x=c(1,1,1,1,1,2,2)) +# Test for #4990 - `:=` recycle error during 'by' +DT <- data.table(x=INT(1,1,1,1,1,2,2)) # on a new column -test(1133.1, DT[, new := c(1,2), by=x], data.table(x=c(1,1,1,1,1,2,2), new=c(1,2,1,2,1,1,2)), warning="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") -# on an already existing column -test(1133.2, DT[, new := c(1,2), by=x], data.table(x=c(1,1,1,1,1,2,2), new=c(1,2,1,2,1,1,2)), warning="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") +test(1133.1, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") +test(1133.2, DT, data.table(x=INT(1,1,1,1,1,2,2))) +# on an already existing column: +DT[, new:=99L] +test(1133.3, DT[, new := c(1,2), by=x], error="Type of RHS ('double') must match LHS ('integer')") +test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") +test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) +test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) +test(1133.7, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # Fix for FR #2496 - catch `{` in `:=` expression in `j`: DT <- data.table(x=c("A", "A", "B", "B"), val =1:4) @@ -3983,9 +3987,8 @@ DT = CJ(c("Corp", "CORP"), 1:3) test(1190, setkey(DT), DT) # tests no warning here from setkey, was "key rebuilt" due to inconsistent locale sorting in v1.8.10 # non-exact recycling in j results. Was caught with error in v1.8.10, now recycles with remainder and warning -DT = data.table(a=1:2,b=1:6) -test(1191, DT[, list(b,1:2), by=a], data.table(a=INT(1,1,1,2,2,2),b=INT(1,3,5,2,4,6),V2=INT(1,2,1,1,2,1)), - warning="Recycled leaving remainder of 1 item.*This warning is once only") +DT = data.table(a=1:2, b=1:6) +test(1191, DT[, list(b,1:2), by=a], error="Supplied 2 items for column 2 of group 1 which has 3 rows.") # that twiddle is used consistently, and tolerance has gone. # nice example from : http://stackoverflow.com/questions/21885290/data-table-roll-nearest-returns-multiple-results @@ -4644,13 +4647,11 @@ test(1288.16, rbindlist(ll, fill=TRUE), error="fill=TRUE, but names of input lis # fix for #5647 dt = data.table(x=1L, y=1:10) -cp = copy(dt) -test(1289.1, dt[,z := c(rep(NA, 5), y), by=x], cp[, z := c(rep(NA, 5), y[1:5])], warning="RHS 1 is length 15") -dt = data.table(x=c(1:2), y=1:10) -cp = copy(dt) -test(1289.2, dt[, z := c(rep(NA, 5),y), by=x], cp[, z := rep(NA_integer_, 10)], - warning=c("RHS 1 is length 10.*group 1.*last 5 element.*discarded", - "RHS 1 is length 10.*group 2.*last 5 element.*discarded")) +test(1289.1, dt[, z := c(rep(NA,5), y), by=x], error="Supplied 15 items to be assigned to group 1 of size 10 in column 'z'") +test(1289.2, names(dt), c("x","y")) +dt = data.table(x=1:2, y=1:10) +test(1289.3, dt[, z := c(rep(NA,5), y), by=x], error="Supplied 10 items to be assigned to group 1 of size 5 in column 'z'") +test(1289.4, names(dt), c("x","y")) ######################################## # Extensve testing for "duplicate" names diff --git a/src/assign.c b/src/assign.c index 8969d17685..1eb04d5f95 100644 --- a/src/assign.c +++ b/src/assign.c @@ -443,11 +443,8 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v if ((coln+1)<=oldncol && isFactor(VECTOR_ELT(dt,coln)) && !isString(thisvalue) && TYPEOF(thisvalue)!=INTSXP && TYPEOF(thisvalue)!=LGLSXP && !isReal(thisvalue) && !isNewList(thisvalue)) // !=INTSXP includes factor error("Can't assign to column '%s' (type 'factor') a value of type '%s' (not character, factor, integer or numeric)", CHAR(STRING_ELT(names,coln)),type2char(TYPEOF(thisvalue))); - if (nrow>0 && targetlen>0) { - if (vlen>targetlen) - warning("Supplied %d items to be assigned to %d items of column '%s' (%d unused)", vlen, targetlen,CHAR(colnam),vlen-targetlen); - else if (vlen>0 && targetlen%vlen != 0) - warning("Supplied %d items to be assigned to %d items of column '%s' (recycled leaving remainder of %d items).",vlen,targetlen,CHAR(colnam),targetlen%vlen); + if (nrow>0 && targetlen>0 && vlen>1 && vlen!=targetlen) { + error("Supplied %d items to be assigned to %d items of column '%s'. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code.", vlen, targetlen,CHAR(colnam)); } } // having now checked the inputs, from this point there should be no errors so we can now proceed to @@ -810,23 +807,30 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v return(dt); // needed for `*tmp*` mechanism (when := isn't used), and to return the new object after a := for compound syntax. } +/* static Rboolean anyNamed(SEXP x) { if (MAYBE_REFERENCED(x)) return TRUE; if (isNewList(x)) for (int i=0; i len ? len : length(source); // fix for 5647. when length(source) > len, slen must be len. - if (slen<1) return; if (TYPEOF(target) != TYPEOF(source)) error("Internal error: TYPEOF(target)['%s']!=TYPEOF(source)['%s']", type2char(TYPEOF(target)),type2char(TYPEOF(source))); // # nocov + int slen = length(source); + if (slen!=1 && slen!=len) error("Internal error: recycle length error not caught earlier. slen=%d len=%d", slen, len); // # nocov + // Internal error because the column has already been added to the DT, so length mismatch should have been caught before adding the column. + // for 5647 this used to limit slen to len, but no longer + + /* + TODELETE ... if (isNewList(source)) { // A list() column; i.e. target is a column of pointers to SEXPs rather than the much more common case // where memrecycle copies the DATAPTR data to the atomic target from the atomic source. @@ -845,82 +849,93 @@ void memrecycle(SEXP target, SEXP where, int start, int len, SEXP source) protecti++; } } + */ size_t size = SIZEOF(target); if (!length(where)) { switch (TYPEOF(target)) { - case INTSXP : case REALSXP : case LGLSXP : - break; - case STRSXP : - for (; r0?1:0; r<(len/slen); r++) { // if the first slen were done in the switch above, convert r=slen to r=1 - memcpy((char *)DATAPTR(target) + (start+r*slen)*size, - (char *)DATAPTR(source), - slen * size); + case REALSXP : + if (slen==1) { + double *td = REAL(target); + const double val = REAL(source)[0]; + for (int i=0; i10 it may be worth memcpy, but we'd need to first know if 'where' was a contiguous subset } - UNPROTECT(protecti); + // DELETE ... UNPROTECT(protecti); } SEXP allocNAVector(SEXPTYPE type, R_len_t n) diff --git a/src/dogroups.c b/src/dogroups.c index 0f7b86ae4b..ad85fe9c96 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -23,11 +23,10 @@ void setSizes() { SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verbose) { - R_len_t i, j, k, rownum, ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp, vlen, origIlen=0, origSDnrow=0; + R_len_t i, j, k, rownum, ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp, origIlen=0, origSDnrow=0; int protecti=0; SEXP names, names2, xknames, bynames, dtnames, ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, listwrap, target, source, tmp; - Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE, recycleWarn=TRUE; - size_t size; // must be size_t, otherwise bug #5305 (integer overflow in memcpy) + Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; if (!isInteger(order)) error("Internal error: order not integer vector"); // # nocov @@ -132,7 +131,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP for (j=0; j=0 && nrowgroups) for (j=0; j=0) { for (j=0; j0 && vlen>grpn && j0 && grpn%vlen != 0) - warning("Supplied %d items to be assigned to group %d of size %d in column '%s' (recycled leaving remainder of %d items).",vlen,i+1,grpn,CHAR(STRING_ELT(dtnames,INTEGER(lhs)[j]-1)),grpn%vlen); - - memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS); - + memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS); // length mismatch checked above for all jval columns before starting to add any new columns copyMostAttrib(RHS, target); // not names, otherwise test 778 would fail. /* OLD FIX: commented now. The fix below resulted in segfault on factor columns because I dint set the "levels" Instead of fixing that, I just removed setting class if it's factor. Not appropriate fix. @@ -316,7 +324,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX OUTDATED: if (!isFactor(RHS)) setAttrib(target, R_ClassSymbol, getAttrib(RHS, R_ClassSymbol)); OUTDATED: // added !isFactor(RHS) to fix #5104 (side-effect of fixing #2531) See also #155 and #36 */ - } UNPROTECT(1); continue; @@ -415,12 +422,14 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } switch (TYPEOF(target)) { // rarely called so no need to optimize this switch case LGLSXP : - case INTSXP : - for (int r=0; r Date: Wed, 23 Jan 2019 21:57:55 -0800 Subject: [PATCH 4/9] Passes 'R CMD check' again on R 3.5.2. Probably not on R 3.1.0 yet. --- inst/tests/tests.Rraw | 21 ++++++++------------- src/assign.c | 24 +++++++++--------------- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b881644d95..84bed528fd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5291,21 +5291,16 @@ test(1364.24, setdiff_(X, Y[0L], by.x = 'a'), c = c("1", "3", "2"), d = c(1L, 3L, 2L), e = c(1L, 5L, 7L))) # not join along with by=.EACHI, #604 -DT <- data.table(A=c(1,1,1,2,2,2,2,3,3,4,5,5))[, `:=`(B=as.integer(A), C=c("c", "e", "a", "d"), D=factor(c("c", "e", "a", "d")), E=1:12)] +DT <- data.table(A=c(1,1,1,2,2,2,2,3,3,4,5,5))[, `:=`(B=as.integer(A), C=rep(c("c", "e", "a", "d"),3L), D=factor(rep(c("c", "e", "a", "d"),3L)), E=1:12)] setkey(DT, A) -test(1365.1, suppressMessages(DT[!J(c(2,5)), sum(E), by=.EACHI]), - suppressMessages(DT[J(c(1,3,4)), sum(E), by=.EACHI])) +test(1365.1, DT[!J(c(2,5)), sum(E), by=.EACHI], DT[J(c(1,3,4)), sum(E), by=.EACHI]) setkey(DT, B) -test(1365.2, suppressMessages(DT[!J(c(4:5)), list(.N, sum(E)), by=.EACHI]), - suppressMessages(DT[J(1:3), list(.N, sum(E)), by=.EACHI])) +test(1365.2, DT[!J(c(4:5)), list(.N, sum(E)), by=.EACHI], DT[J(1:3), list(.N, sum(E)), by=.EACHI]) setkey(DT, C) -test(1365.3, suppressMessages(copy(DT)[!"c", f := .N, by=.EACHI]), - suppressMessages(copy(DT)[c("a", "d", "e"), f := .N, by=.EACHI])) +test(1365.3, copy(DT)[!"c", f:=.N, by=.EACHI], copy(DT)[c("a","d","e"), f:=.N, by=.EACHI]) setkey(DT, D) -test(1365.4, suppressMessages(DT[!J(factor("c")), .N, by=.EACHI]), - suppressMessages(DT[J(factor(c("a", "d", "e"))), .N, by=.EACHI])) -test(1365.5, suppressMessages(DT[!"c", lapply(.SD, sum), by=.EACHI, .SDcols=c("B", "E")]), - suppressMessages(DT[c("a", "d", "e"), lapply(.SD, sum), by=.EACHI, .SDcols=c("B", "E")])) +test(1365.4, DT[!J(factor("c")), .N, by=.EACHI], DT[J(factor(c("a","d","e"))), .N, by=.EACHI]) +test(1365.5, DT[!"c", lapply(.SD, sum), by=.EACHI, .SDcols=c("B","E")], DT[c("a","d","e"), lapply(.SD, sum), by=.EACHI, .SDcols=c("B", "E")]) # uniqlengths doesn't error on 0-length input test(1366, uniqlengths(integer(0), 0L), integer(0)) @@ -8527,7 +8522,7 @@ setindex(dt, b) dt[, a := as.logical(sum(a)), by = b] test(1632.2, names(attributes(attr(dt, 'index'))), "__b") dt = data.table(a = rep(TFvec, 3), b = c("x", "y", "z")) -test(1632.3, copy(dt)[, c := !a, by=b], copy(dt)[, c := !TFvec]) +test(1632.3, copy(dt)[, c := !a, by=b], copy(dt)[, c := rep(!TFvec,3L)]) # by accepts colA:colB for interactive scenarios, #1395 dt = data.table(x=rep(1,18), y=rep(1:2, each=9), z=rep(1:3,each=6), a=rep(1:6, each=3))[, b := 6] @@ -12344,7 +12339,7 @@ test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime" # wrong bmerge result if character gets coerced to factor, i is keyed, and level order in i is different from x, #2881 iris = data.table(iris) -iris$grp = c('A', 'B') +iris$grp = rep(c('A','B'), 75L) iris[, Species1 := factor(Species, levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))] iSorted = data.table(Species1 = c('setosa','Virginica'), grp='B', key=c("grp","Species1")) i = setkey(copy(iSorted),NULL) diff --git a/src/assign.c b/src/assign.c index 1eb04d5f95..3e323c99f5 100644 --- a/src/assign.c +++ b/src/assign.c @@ -807,21 +807,18 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v return(dt); // needed for `*tmp*` mechanism (when := isn't used), and to return the new object after a := for compound syntax. } -/* -static Rboolean anyNamed(SEXP x) { - if (MAYBE_REFERENCED(x)) return TRUE; +static bool anyNamed(SEXP x) { + if (MAYBE_REFERENCED(x)) return true; if (isNewList(x)) for (int i=0; i Date: Wed, 23 Jan 2019 22:53:00 -0800 Subject: [PATCH 5/9] news item tweak --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index b5e2c25af8..902d2d8110 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,7 @@ #### NEW FEATURES -1. `:=` no longer recycles too-short RHS vectors to match the length on the LHS. Like base R, there was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length. All cases are now an error unless `length(RHS)==1` or `length(RHS)==length(LHS)`. Consistent feedback was that recycling is rarely useful and more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. +1. `:=` no longer recycles too-short RHS vectors to match the LHS length. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). All recycling is now an error unless `length(RHS)==1` or `length(RHS)==length(LHS)`. Consistent feedback was that recycling is rarely useful and more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. #### BUG FIXES From 56f72904ddf3f5b6fa548831ebc923de02cf4981 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 24 Jan 2019 15:53:05 -0800 Subject: [PATCH 6/9] CRAN_Release.cmd only --- CRAN_Release.cmd | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/CRAN_Release.cmd b/CRAN_Release.cmd index 38c19ea24b..a1927b554e 100644 --- a/CRAN_Release.cmd +++ b/CRAN_Release.cmd @@ -106,15 +106,15 @@ test.data.table() install.packages("xml2") # to check the 150 URLs in NEWS.md under --as-cran below q("no") R CMD build . -R CMD check data.table_1.11.9.tar.gz --as-cran -R CMD INSTALL data.table_1.11.9.tar.gz +R CMD check data.table_1.12.1.tar.gz --as-cran +R CMD INSTALL data.table_1.12.1.tar.gz # Test C locale doesn't break test suite (#2771) echo LC_ALL=C > ~/.Renviron R Sys.getlocale()=="C" q("no") -R CMD check data.table_1.11.9.tar.gz +R CMD check data.table_1.12.1.tar.gz rm ~/.Renviron R @@ -145,7 +145,7 @@ alias R310=~/build/R-3.1.0/bin/R ### END ONE TIME BUILD cd ~/GitHub/data.table -R310 CMD INSTALL ./data.table_1.11.9.tar.gz +R310 CMD INSTALL ./data.table_1.12.1.tar.gz R310 require(data.table) test.data.table() @@ -157,7 +157,7 @@ test.data.table() vi ~/.R/Makevars # Make line SHLIB_OPENMP_CFLAGS= active to remove -fopenmp R CMD build . -R CMD INSTALL data.table_1.11.9.tar.gz # ensure that -fopenmp is missing and there are no warnings +R CMD INSTALL data.table_1.12.1.tar.gz # ensure that -fopenmp is missing and there are no warnings R require(data.table) # observe startup message about no OpenMP detected test.data.table() @@ -165,7 +165,7 @@ q("no") vi ~/.R/Makevars # revert change above R CMD build . -R CMD check data.table_1.11.9.tar.gz +R CMD check data.table_1.12.1.tar.gz ##################################################### # R-devel with UBSAN, ASAN and strict-barrier on too @@ -194,7 +194,7 @@ cd R-devel-strict # important to change directory name before building not af make alias Rdevel-strict='~/build/R-devel-strict/bin/R --vanilla' cd ~/GitHub/data.table -Rdevel-strict CMD INSTALL data.table_1.11.9.tar.gz +Rdevel-strict CMD INSTALL data.table_1.12.1.tar.gz # Check UBSAN and ASAN flags appear in compiler output above. Rdevel was compiled with them so should be passed through to here Rdevel-strict install.packages(c("bit64","xts","nanotime","R.utils"), repos="http://cloud.r-project.org") # minimum packages needed to not skip any tests in test.data.table() @@ -230,7 +230,7 @@ cd R-devel make cd ~/GitHub/data.table vi ~/.R/Makevars # make the -O0 -g line active, for info on source lines with any problems -Rdevel CMD INSTALL data.table_1.11.9.tar.gz +Rdevel CMD INSTALL data.table_1.12.1.tar.gz Rdevel -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --show-leak-kinds=definite" # gctorture(TRUE) # very slow, many days # gctorture2(step=100) @@ -272,7 +272,7 @@ There are some things to overcome to achieve compile without USE_RINTERNALS, tho ## Rdevel ## install.packages("bit64") ## q("no") -## Rdevel CMD INSTALL ~/data.table_1.11.9.tar.gz +## Rdevel CMD INSTALL ~/data.table_1.12.1.tar.gz ## Rdevel ## .Machine$sizeof.longdouble # check 0 ## require(data.table) @@ -287,7 +287,7 @@ cd ~/build/rchk/trunk . ../scripts/config.inc . ../scripts/cmpconfig.inc # edit ~/.R/Makevars to set CFLAGS=-O0 -g so that rchk can provide source line numbers -echo 'install.packages("~/GitHub/data.table/data.table_1.11.9.tar.gz",repos=NULL)' | ./bin/R --slave +echo 'install.packages("~/GitHub/data.table/data.table_1.12.1.tar.gz",repos=NULL)' | ./bin/R --slave ../scripts/check_package.sh data.table cat packages/lib/data.table/libs/*check # keep running and rerunning locally until all problems cease. @@ -413,8 +413,7 @@ BiocManager::valid() avail = available.packages(repos=BiocManager::repositories()) # includes CRAN at the end from getOption("repos"). And ensure latest Bioc version is in repo path here. deps = tools::package_dependencies("data.table", db=avail, which="most", reverse=TRUE, recursive=FALSE)[[1]] -exclude = c("TCGAbiolinks", # too long (>30mins): https://github.com/BioinformaticsFMRP/TCGAbiolinks/issues/240 - "ctsem") # too long (>30mins) see Ttotal on CRAN checks and warnings/errors: https://cran.r-project.org/web/checks/check_results_ctsem.html +exclude = c("TCGAbiolinks") # too long (>30mins): https://github.com/BioinformaticsFMRP/TCGAbiolinks/issues/240 deps = deps[-match(exclude, deps)] table(avail[deps,"Repository"]) length(deps) @@ -552,7 +551,7 @@ run = function(which=c("not.started","cran.fail","bioc.fail","both.fail","rerun. } # ** ensure latest version installed into revdeplib ** -system("R CMD INSTALL ~/GitHub/data.table/data.table_1.11.9.tar.gz") +system("R CMD INSTALL ~/GitHub/data.table/data.table_1.12.1.tar.gz") run("rerun.all") out = function(fnam="~/fail.log") { @@ -561,10 +560,9 @@ out = function(fnam="~/fail.log") { cat(paste(x,collapse=" "), "\n") cat(capture.output(sessionInfo()), "\n", file=fnam, sep="\n") cat("> BiocManager::install()\n", file=fnam, append=TRUE) - capture.output(BiocManager::install(), type="message", file=fnam, append=TRUE) + cat(capture.output(BiocManager::install(), type="message"), file=fnam, sep="\n", append=TRUE) cat("> BiocManager::valid()\n", file=fnam, append=TRUE) - capture.output(ans<-BiocManager::valid(), type="message", file=fnam, append=TRUE) - cat(ans, "\n", file=fnam, append=TRUE, sep="\n") + cat(isTRUE(BiocManager::valid()), "\n\n\n", file=fnam, append=TRUE) for (i in x) { system(paste0("ls | grep '",i,".*tar.gz' >> ",fnam)) system(paste0("grep -H . ./",i,".Rcheck/00check.log >> ",fnam)) From 70ab25df880d55f8f59ef7d6f98630b43b6248a0 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 24 Jan 2019 17:29:09 -0800 Subject: [PATCH 7/9] news item: better focus on length>1 vectors --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 902d2d8110..9daf1abd90 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,7 @@ #### NEW FEATURES -1. `:=` no longer recycles too-short RHS vectors to match the LHS length. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). All recycling is now an error unless `length(RHS)==1` or `length(RHS)==length(LHS)`. Consistent feedback was that recycling is rarely useful and more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. +1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 758 CRAN and Bioconductor packages using data.table were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead. #### BUG FIXES From c615af8d52797bd09def6f5bbbe74897126bf267 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 3 Feb 2019 11:32:40 -0800 Subject: [PATCH 8/9] Reinstated USE_RINTERNALS so the progress in the branch can be merged and released. --- src/data.table.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/data.table.h b/src/data.table.h index 20c496fa27..498c6d6f88 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,4 +1,5 @@ #include +#define USE_RINTERNALS #include // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); From 92b785ad6cb6b31c9590c5f4297c8dfd622f9873 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 3 Feb 2019 12:00:44 -0800 Subject: [PATCH 9/9] Added PR link to news item --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index e8bda624be..3a1036e651 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,7 @@ #### NEW FEATURES -1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 758 CRAN and Bioconductor packages using data.table were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead. +1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 758 CRAN and Bioconductor packages using data.table were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead, [#3310](https://github.com/Rdatatable/data.table/pull/3310). #### BUG FIXES