From 44d6f6128741825341c74875dd63ef4a8a6d9c17 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Jun 2021 11:31:02 -0400 Subject: [PATCH 1/7] melt(na.rm=TRUE) should remove rows with missing list column --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 65c595d273..a36bbce998 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3070,6 +3070,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) if (test_bit64) NA_integer64_ else NA)), # 'else NA' otherwise NULL is not removed when test_bit64 is FALSE measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) + test(1035.0186, melt(data.table(list_1=list(1), list_3=list(3), num_1=1, num_2=2), measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", list=list(1), num=1)) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From 18e106ee3ad5f560621be45103fcbb60043d59ad Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Jun 2021 12:25:58 -0400 Subject: [PATCH 2/7] use logical NA scalar instead of NULL for missing list elements --- src/assign.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/assign.c b/src/assign.c index af3768f81a..bed5aa07c7 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1133,8 +1133,14 @@ void writeNA(SEXP v, const int from, const int n) // If there's ever a way added to R API to pass NA_STRING to allocVector() to tell it to initialize with NA not "", would be great for (int i=from; i<=to; ++i) SET_STRING_ELT(v, i, NA_STRING); break; - case VECSXP: case EXPRSXP : - // although allocVector already initializes to R_NilValue, we use writeNA() in other places too, so we shouldn't skip this assign + case VECSXP: + for (int i=from; i<=to; ++i) { + SEXP na_scalar = allocVector(LGLSXP, 1); + LOGICAL(na_scalar)[0] = NA_LOGICAL; + SET_VECTOR_ELT(v, i, na_scalar); + } + break; + case EXPRSXP : for (int i=from; i<=to; ++i) SET_VECTOR_ELT(v, i, R_NilValue); break; default : From 83ffa5dcf2f8a00cc8015dbe0f61215545774df9 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Jun 2021 12:26:18 -0400 Subject: [PATCH 3/7] plonk list yields NA scalar instead of NULL --- inst/tests/tests.Rraw | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a36bbce998..dce5a62dbf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1426,12 +1426,12 @@ test(467, DT[,list(newcol=7L,.SD),by=a], error="use := by group instead") # Test empty list column DT = data.table(a=1:3,b=4:6) -test(468, DT[,foo:=list()], data.table(a=1:3,b=4:6,foo=list())) +test(468, DT[,foo:=list()], data.table(a=1:3,b=4:6,foo=list(NA,NA,NA))) # Test plonk list -test(469, DT[,bar:=list(1,"a",3.14)], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14))) +test(469, DT[,bar:=list(1,"a",3.14)], data.table(a=1:3,b=4:6,foo=list(NA,NA,NA),bar=list(1,"a",3.14))) # Test plonk list variable (to catch deparse treating j=list() specially) x = list(2,"b",2.718) -test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14),baz=list(2,"b",2.718))) +test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(NA,NA,NA),bar=list(1,"a",3.14),baz=list(2,"b",2.718))) # Test recycling list DT = data.table(a=1:4,b=5:8) test(471.1, DT[,foo:=list("a",2:3)], error="Supplied 2 items to be assigned to 4 items of column 'foo'.*recycle") @@ -1799,7 +1799,7 @@ test(612, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Assign to subset ok (NA initialized in the other items) ok : test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) +test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NA, 10:11, NA),each=3)) # Combining := by group with i test(616, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) test(617, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) @@ -12150,9 +12150,9 @@ gc() dt = data.table(a=factor(1:5, levels=1:10), b=as.list(letters[1:5])) dt2 = data.table(a=as.factor(1:10)) test(1851.1, dt[dt2, .SD, by=.EACHI, on="a", .SDcols="b"], - data.table(a=as.factor(1:10), b={tmp=vector("list",10); tmp[1:5]=as.list(letters[1:5]); tmp})) + data.table(a=as.factor(1:10), b=list("a","b","c","d","e",NA,NA,NA,NA,NA))) test(1851.2, data.table(a = 1:2, b = list("yo", NULL))[.(1:3), on=.(a), x.b, by = .EACHI], - data.table(a = 1:3, x.b = list("yo", NULL, NULL))) + data.table(a = 1:3, x.b = list("yo", NULL, NA))) # test that indices are only used on exact name match, #2465 DT1 <- data.table(colname=c("test1","test2","test2","test3"), colname_with_suffix=c("other","test","includes test within","other")) @@ -14396,7 +14396,7 @@ test(2002.03, rbindlist( list(list(a=1L, b=2L, x=NULL), list(a=2L, b=NULL, x=NUL warning="Column 3 ['x'] of item 1 is length 0. This (and 2 others like it) has been filled with NA (NULL for list columns) to make each item uniform.") # tests from #1302 test(2002.04, rbindlist( list(list(a=1L,z=list()), list(a=2L, z=list("m"))) ), - data.table(a=1:2, z=list(NULL, "m")), + data.table(a=1:2, z=list(NA, "m")), warning="Column 2 ['z'] of item 1 is length 0. This (and 0 others like it) has been filled with NA") test(2002.05, rbindlist( list( list(a=1L, z=list("z")), list(a=2L, z=list(c("a","b"))) )), data.table(a=1:2, z=list("z", c("a","b")))) From a06651be4db1f9e4a4cad76e6a528d458195cf80 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Jun 2021 14:04:49 -0400 Subject: [PATCH 4/7] melt outputs NA instead of NULL in rows corresponding to missing list columns --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index cef5c3918b..5e1b0619e5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -145,6 +145,8 @@ 25. `.SDcols=` is now documented in `?data.table` and it is now an error if the logical vector's length is not equal to the number of columns (consistent with `data.table`'s no-recycling policy; see new feature 1 in v1.12.2 Apr 2019), [#4115](https://github.com/Rdatatable/data.table/issues/4115). Thanks to @Henrik-P for reporting and Jan Gorecki for the PR. +26. `melt` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From 771e39a4d15123c7678caff3d4236d6b1cd199a8 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 28 Jun 2021 09:55:37 -0400 Subject: [PATCH 5/7] melt(na.rm=FALSE) should return NA when input DT has missing list column --- inst/tests/tests.Rraw | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dce5a62dbf..381fdbac08 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3070,7 +3070,9 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) if (test_bit64) NA_integer64_ else NA)), # 'else NA' otherwise NULL is not removed when test_bit64 is FALSE measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) - test(1035.0186, melt(data.table(list_1=list(1), list_3=list(3), num_1=1, num_2=2), measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", list=list(1), num=1)) + DT_missing_l_2 = data.table(num_1=1, num_2=2, list_1=list(1), list_3=list(3)) + test(1035.0186, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", num=1, list=list(1))) + test(1035.0187, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=FALSE), data.table(char=c("1","2","3"), num=c(1,2,NA), list=list(1,NA,3))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From 8af77e80978de749489388d40d636efe3d472903 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Jul 2021 12:15:50 -0400 Subject: [PATCH 6/7] na.rm no longer ignored for list columns --- man/melt.data.table.Rd | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index b31017356b..6dd74291d5 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -58,9 +58,7 @@ variables. When all \code{measure.vars} are not of the same type, they'll be coerced according to the hierarchy \code{list} > \code{character} > \code{numeric > integer > logical}. For example, if any of the measure variables is a -\code{list}, then entire value column will be coerced to a list. Note that, -if the type of \code{value} column is a list, \code{na.rm = TRUE} will have no -effect. +\code{list}, then entire value column will be coerced to a list. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt From 44a46c8852cde31ce3ff6d58318ab7fff3f61437 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 16 Jul 2021 10:46:10 -0600 Subject: [PATCH 7/7] added listNA to writeNA, true just from melt for now --- NEWS.md | 2 +- inst/tests/tests.Rraw | 14 +++++++------- src/assign.c | 21 +++++++++++---------- src/data.table.h | 2 +- src/dogroups.c | 6 +++--- src/fmelt.c | 2 +- src/rbindlist.c | 4 ++-- 7 files changed, 26 insertions(+), 25 deletions(-) diff --git a/NEWS.md b/NEWS.md index a9a9ebec24..5d6a7da3ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -145,7 +145,7 @@ 25. `.SDcols=` is now documented in `?data.table` and it is now an error if the logical vector's length is not equal to the number of columns (consistent with `data.table`'s no-recycling policy; see new feature 1 in v1.12.2 Apr 2019), [#4115](https://github.com/Rdatatable/data.table/issues/4115). Thanks to @Henrik-P for reporting and Jan Gorecki for the PR. -26. `melt` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. +26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. ## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7a64d7edb0..aa654a236c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1426,12 +1426,12 @@ test(467, DT[,list(newcol=7L,.SD),by=a], error="use := by group instead") # Test empty list column DT = data.table(a=1:3,b=4:6) -test(468, DT[,foo:=list()], data.table(a=1:3,b=4:6,foo=list(NA,NA,NA))) +test(468, DT[,foo:=list()], data.table(a=1:3,b=4:6,foo=list())) # Test plonk list -test(469, DT[,bar:=list(1,"a",3.14)], data.table(a=1:3,b=4:6,foo=list(NA,NA,NA),bar=list(1,"a",3.14))) +test(469, DT[,bar:=list(1,"a",3.14)], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14))) # Test plonk list variable (to catch deparse treating j=list() specially) x = list(2,"b",2.718) -test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(NA,NA,NA),bar=list(1,"a",3.14),baz=list(2,"b",2.718))) +test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14),baz=list(2,"b",2.718))) # Test recycling list DT = data.table(a=1:4,b=5:8) test(471.1, DT[,foo:=list("a",2:3)], error="Supplied 2 items to be assigned to 4 items of column 'foo'.*recycle") @@ -1799,7 +1799,7 @@ test(612, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Assign to subset ok (NA initialized in the other items) ok : test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NA, 10:11, NA),each=3)) +test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) # Combining := by group with i test(616, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) test(617, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) @@ -12152,9 +12152,9 @@ gc() dt = data.table(a=factor(1:5, levels=1:10), b=as.list(letters[1:5])) dt2 = data.table(a=as.factor(1:10)) test(1851.1, dt[dt2, .SD, by=.EACHI, on="a", .SDcols="b"], - data.table(a=as.factor(1:10), b=list("a","b","c","d","e",NA,NA,NA,NA,NA))) + data.table(a=as.factor(1:10), b={tmp=vector("list",10); tmp[1:5]=as.list(letters[1:5]); tmp})) test(1851.2, data.table(a = 1:2, b = list("yo", NULL))[.(1:3), on=.(a), x.b, by = .EACHI], - data.table(a = 1:3, x.b = list("yo", NULL, NA))) + data.table(a = 1:3, x.b = list("yo", NULL, NULL))) # test that indices are only used on exact name match, #2465 DT1 <- data.table(colname=c("test1","test2","test2","test3"), colname_with_suffix=c("other","test","includes test within","other")) @@ -14398,7 +14398,7 @@ test(2002.03, rbindlist( list(list(a=1L, b=2L, x=NULL), list(a=2L, b=NULL, x=NUL warning="Column 3 ['x'] of item 1 is length 0. This (and 2 others like it) has been filled with NA (NULL for list columns) to make each item uniform.") # tests from #1302 test(2002.04, rbindlist( list(list(a=1L,z=list()), list(a=2L, z=list("m"))) ), - data.table(a=1:2, z=list(NA, "m")), + data.table(a=1:2, z=list(NULL, "m")), warning="Column 2 ['z'] of item 1 is length 0. This (and 0 others like it) has been filled with NA") test(2002.05, rbindlist( list( list(a=1L, z=list("z")), list(a=2L, z=list(c("a","b"))) )), data.table(a=1:2, z=list("z", c("a","b")))) diff --git a/src/assign.c b/src/assign.c index bed5aa07c7..0dc38c9b0a 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1097,8 +1097,9 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con return memrecycle_message[0] ? memrecycle_message : NULL; } -void writeNA(SEXP v, const int from, const int n) +void writeNA(SEXP v, const int from, const int n, const bool listNA) // e.g. for use after allocVector() which does not initialize its result. +// listNA for #5503 { const int to = from-1+n; // writing to position 2147483647 in mind, 'i<=to' in loop conditions switch(TYPEOF(v)) { @@ -1133,13 +1134,13 @@ void writeNA(SEXP v, const int from, const int n) // If there's ever a way added to R API to pass NA_STRING to allocVector() to tell it to initialize with NA not "", would be great for (int i=from; i<=to; ++i) SET_STRING_ELT(v, i, NA_STRING); break; - case VECSXP: - for (int i=from; i<=to; ++i) { - SEXP na_scalar = allocVector(LGLSXP, 1); - LOGICAL(na_scalar)[0] = NA_LOGICAL; - SET_VECTOR_ELT(v, i, na_scalar); - } - break; + case VECSXP: { + // See #5053 for comments and dicussion re listNA + // although allocVector initializes to R_NilValue, we use writeNA() in other places too, so we shouldn't skip the R_NilValue assign + // ScalarLogical(NA_LOGICAL) returns R's internal constant R_LogicalNAValue (no alloc and no protect needed) + const SEXP na = listNA ? ScalarLogical(NA_LOGICAL) : R_NilValue; + for (int i=from; i<=to; ++i) SET_VECTOR_ELT(v, i, na); + } break; case EXPRSXP : for (int i=from; i<=to; ++i) SET_VECTOR_ELT(v, i, R_NilValue); break; @@ -1155,7 +1156,7 @@ SEXP allocNAVector(SEXPTYPE type, R_len_t n) // We guess that author of allocVector would have liked to initialize with NA but was prevented since memset // is restricted to one byte. SEXP v = PROTECT(allocVector(type, n)); - writeNA(v, 0, n); + writeNA(v, 0, n, false); UNPROTECT(1); return(v); } @@ -1165,7 +1166,7 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { // TODO: remove allocNAVector above when usage in fastmean.c, fcast.c and fmelt.c can be adjusted; see comments in PR3724 SEXP v = PROTECT(allocVector(TYPEOF(x), n)); copyMostAttrib(x, v); - writeNA(v, 0, n); + writeNA(v, 0, n, false); UNPROTECT(1); return(v); } diff --git a/src/data.table.h b/src/data.table.h index 50d43a34a5..e897f6a8b4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -123,7 +123,7 @@ SEXP growVector(SEXP x, R_len_t newlen); // assign.c SEXP allocNAVector(SEXPTYPE type, R_len_t n); SEXP allocNAVectorLike(SEXP x, R_len_t n); -void writeNA(SEXP v, const int from, const int n); +void writeNA(SEXP v, const int from, const int n, const bool listNA); void savetl_init(), savetl(SEXP s), savetl_end(); int checkOverAlloc(SEXP x); diff --git a/src/dogroups.c b/src/dogroups.c index 5bb9983408..be5d675b23 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -204,7 +204,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } if (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)) { for (int j=0; j0 if (TYPEOF(source) != TYPEOF(target)) diff --git a/src/fmelt.c b/src/fmelt.c index 0ee7e70ef8..9bfe931d85 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -495,7 +495,7 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP thiscol = input_col_or_NULL(DT, data, thisvaluecols, i, j); if (thiscol == R_NilValue) { if (!data->narm) { - writeNA(target, j*data->nrow, data->nrow); + writeNA(target, j*data->nrow, data->nrow, true); // listNA=true #5053 } }else{ if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { diff --git a/src/rbindlist.c b/src/rbindlist.c index 5dab7fff51..5d0b6547e5 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -406,7 +406,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (!length(li)) continue; // NULL items in the list() of DT/DF; not if thisnrow==0 because we need to retain (unused) factor levels (#3508) int w = usenames ? colMap[i*ncol + j] : j; if (w==-1) { - writeNA(target, ansloc, thisnrow); + writeNA(target, ansloc, thisnrow, false); } else { SEXP thisCol = VECTOR_ELT(li, w); SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : (isString(thisCol) ? thisCol : VECTOR_ELT(coercedForFactor, i)); @@ -512,7 +512,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int w = usenames ? colMap[i*ncol + j] : j; SEXP thisCol; if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871 - writeNA(target, ansloc, thisnrow); // writeNA is integer64 aware and writes INT64_MIN + writeNA(target, ansloc, thisnrow, false); // writeNA is integer64 aware and writes INT64_MIN } else { if ((TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target)) { // do an as.list() on the atomic column; #3528