diff --git a/NEWS.md b/NEWS.md index aece41efd9..ad673a4417 100644 --- a/NEWS.md +++ b/NEWS.md @@ -76,6 +76,8 @@ 10. `X[Y, .SD, by=]` (joining and grouping in the same query) could segfault if i) `by=` is supplied custom data (i.e. not simple expressions of columns), and ii) some rows of `Y` do not match to any rows in `X`, [#4892](https://github.com/Rdatatable/data.table/issues/4892). Thanks to @Kodiologist for reporting, @ColeMiller1 for investigating, and @tlapak for the PR. +11. Assigning a set of 2 or more all-NA values to a factor column could segfault, [#4824](https://github.com/Rdatatable/data.table/issues/4824). Thanks to @clerousset for reporting and @shrektan for fixing. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. 
For example : diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fb69d4d8b9..5db6c3fc2c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17410,3 +17410,8 @@ x = data.table(id = 1:4, key = 'id') y = data.table(id = 2:5, key = 'id') z = data.table(c=c(2L, 2L, 1L, 1L), id=c(2L, 4L, 3L, NA)) test(2178, x[y, .SD, by=.(c(2L, 1L, 2L, 1L))], z) + +# assigning all-na length>1 to a factor column was segfault, #4824 +DT = data.table(FACTOR = factor(rep("a", 3L))) +set(DT, i=1:2, j="FACTOR", value=rep(NA, 2L)) +test(2179, DT$FACTOR, factor(c(NA, NA, "a"))) diff --git a/src/assign.c b/src/assign.c index e811276610..3b9aba0074 100644 --- a/src/assign.c +++ b/src/assign.c @@ -690,7 +690,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con // sourceLen==1 is used in dogroups to recycle the group values into ans to match the nrow of each group's result; sourceStart is set to each group value row. { if (len<1) return NULL; - const int slen = sourceLen>=0 ? sourceLen : length(source); + int slen = sourceLen>=0 ? 
sourceLen : length(source); // since source may get reassigned to a scalar, we should not mark it as const if (slen==0) return NULL; if (sourceStart<0 || sourceStart+slen>length(source)) error(_("Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d"), sourceStart, sourceLen, length(source)); // # nocov @@ -718,7 +718,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con } else if (!sourceIsFactor && !isString(source)) { // target is factor if (allNA(source, false)) { // return false for list and other types that allNA does not support - source = ScalarLogical(NA_LOGICAL); // a global constant in R and won't allocate; fall through to regular zero-copy coerce + source = ScalarLogical(NA_LOGICAL); slen = 1; // a global constant in R and won't allocate; fall through to regular zero-copy coerce } else if (isInteger(source) || isReal(source)) { // allow assigning level numbers to factor columns; test 425, 426, 429 and 1945 const int nlevel = length(getAttrib(target, R_LevelsSymbol));