diff --git a/NEWS.md b/NEWS.md index 3eed2775fb..5d6a7da3ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -145,6 +145,8 @@ 25. `.SDcols=` is now documented in `?data.table` and it is now an error if the logical vector's length is not equal to the number of columns (consistent with `data.table`'s no-recycling policy; see new feature 1 in v1.12.2 Apr 2019), [#4115](https://github.com/Rdatatable/data.table/issues/4115). Thanks to @Henrik-P for reporting and Jan Gorecki for the PR. +26. `melt()` now outputs scalar logical `NA` instead of `NULL` in rows corresponding to missing list columns, for consistency with non-list columns when using `na.rm=TRUE`, [#5053](https://github.com/Rdatatable/data.table/pull/5053). Thanks to Toby Dylan Hocking for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1a890b2566..aa654a236c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3070,6 +3070,9 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) if (test_bit64) NA_integer64_ else NA)), # 'else NA' otherwise NULL is not removed when test_bit64 is FALSE measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) + DT_missing_l_2 = data.table(num_1=1, num_2=2, list_1=list(1), list_3=list(3)) + test(1035.0186, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", num=1, list=list(1))) + test(1035.0187, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=FALSE), data.table(char=c("1","2","3"), num=c(1,2,NA), list=list(1,NA,3))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index b31017356b..6dd74291d5 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -58,9 +58,7 @@ variables. When all \code{measure.vars} are not of the same type, they'll be coerced according to the hierarchy \code{list} > \code{character} > \code{numeric > integer > logical}. For example, if any of the measure variables is a -\code{list}, then entire value column will be coerced to a list. Note that, -if the type of \code{value} column is a list, \code{na.rm = TRUE} will have no -effect. +\code{list}, then entire value column will be coerced to a list. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt diff --git a/src/assign.c b/src/assign.c index af3768f81a..0dc38c9b0a 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1097,8 +1097,9 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con return memrecycle_message[0] ? memrecycle_message : NULL; } -void writeNA(SEXP v, const int from, const int n) +void writeNA(SEXP v, const int from, const int n, const bool listNA) // e.g. for use after allocVector() which does not initialize its result. +// listNA for #5503 { const int to = from-1+n; // writing to position 2147483647 in mind, 'i<=to' in loop conditions switch(TYPEOF(v)) { @@ -1133,8 +1134,14 @@ void writeNA(SEXP v, const int from, const int n) // If there's ever a way added to R API to pass NA_STRING to allocVector() to tell it to initialize with NA not "", would be great for (int i=from; i<=to; ++i) SET_STRING_ELT(v, i, NA_STRING); break; - case VECSXP: case EXPRSXP : - // although allocVector already initializes to R_NilValue, we use writeNA() in other places too, so we shouldn't skip this assign + case VECSXP: { + // See #5053 for comments and dicussion re listNA + // although allocVector initializes to R_NilValue, we use writeNA() in other places too, so we shouldn't skip the R_NilValue assign + // ScalarLogical(NA_LOGICAL) returns R's internal constant R_LogicalNAValue (no alloc and no protect needed) + const SEXP na = listNA ? ScalarLogical(NA_LOGICAL) : R_NilValue; + for (int i=from; i<=to; ++i) SET_VECTOR_ELT(v, i, na); + } break; + case EXPRSXP : for (int i=from; i<=to; ++i) SET_VECTOR_ELT(v, i, R_NilValue); break; default : @@ -1149,7 +1156,7 @@ SEXP allocNAVector(SEXPTYPE type, R_len_t n) // We guess that author of allocVector would have liked to initialize with NA but was prevented since memset // is restricted to one byte. SEXP v = PROTECT(allocVector(type, n)); - writeNA(v, 0, n); + writeNA(v, 0, n, false); UNPROTECT(1); return(v); } @@ -1159,7 +1166,7 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { // TODO: remove allocNAVector above when usage in fastmean.c, fcast.c and fmelt.c can be adjusted; see comments in PR3724 SEXP v = PROTECT(allocVector(TYPEOF(x), n)); copyMostAttrib(x, v); - writeNA(v, 0, n); + writeNA(v, 0, n, false); UNPROTECT(1); return(v); } diff --git a/src/data.table.h b/src/data.table.h index 50d43a34a5..e897f6a8b4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -123,7 +123,7 @@ SEXP growVector(SEXP x, R_len_t newlen); // assign.c SEXP allocNAVector(SEXPTYPE type, R_len_t n); SEXP allocNAVectorLike(SEXP x, R_len_t n); -void writeNA(SEXP v, const int from, const int n); +void writeNA(SEXP v, const int from, const int n, const bool listNA); void savetl_init(), savetl(SEXP s), savetl_end(); int checkOverAlloc(SEXP x); diff --git a/src/dogroups.c b/src/dogroups.c index 5bb9983408..be5d675b23 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -204,7 +204,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } if (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)) { for (int j=0; j0 if (TYPEOF(source) != TYPEOF(target)) diff --git a/src/fmelt.c b/src/fmelt.c index 0ee7e70ef8..9bfe931d85 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -495,7 +495,7 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP thiscol = input_col_or_NULL(DT, data, thisvaluecols, i, j); if (thiscol == R_NilValue) { if (!data->narm) { - writeNA(target, j*data->nrow, data->nrow); + writeNA(target, j*data->nrow, data->nrow, true); // listNA=true #5053 } }else{ if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { diff --git a/src/rbindlist.c b/src/rbindlist.c index 5dab7fff51..5d0b6547e5 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -406,7 +406,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (!length(li)) continue; // NULL items in the list() of DT/DF; not if thisnrow==0 because we need to retain (unused) factor levels (#3508) int w = usenames ? colMap[i*ncol + j] : j; if (w==-1) { - writeNA(target, ansloc, thisnrow); + writeNA(target, ansloc, thisnrow, false); } else { SEXP thisCol = VECTOR_ELT(li, w); SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : (isString(thisCol) ? thisCol : VECTOR_ELT(coercedForFactor, i)); @@ -512,7 +512,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int w = usenames ? colMap[i*ncol + j] : j; SEXP thisCol; if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871 - writeNA(target, ansloc, thisnrow); // writeNA is integer64 aware and writes INT64_MIN + writeNA(target, ansloc, thisnrow, false); // writeNA is integer64 aware and writes INT64_MIN } else { if ((TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target)) { // do an as.list() on the atomic column; #3528