From dd7612892d08aab229d2a07c13fe0f13d23fa41a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 12 Jun 2021 09:18:21 -0400 Subject: [PATCH 01/13] tests that fail for melt(na.rm=TRUE) with list columns --- inst/tests/tests.Rraw | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 41f8978b17..ff9cea858a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3058,6 +3058,12 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) # na.rm=TRUE with list column value, PR#4737 test(1035.016, melt(data.table(a1=1, b1=list(1:2), b2=list(c('foo','bar'))), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=list(1:2))) test(1035.017, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1), a=1, b=1))#this worked even before the PR. + DT.list.missing = data.table(l1=list(1,NA), l2=list(NA,2), n34=c(3,4), NA5=c(NA,5)) + test(1035.0180, melt(DT.list.missing, measure.vars=c("n34","NA5"), na.rm=TRUE)[["value"]], c(3,4,5)) + test(1035.0181, melt(DT.list.missing, measure.vars=c("l1","l2"), na.rm=TRUE)[["value"]], list(1,2)) + test(1035.0182, melt(DT.list.missing, measure.vars=c("l1","n34"), na.rm=TRUE)[["value"]], list(1,3,4), warning="are not all of the same type") + test(1035.0183, melt(DT.list.missing, measure.vars=c("l1","NA5"), na.rm=TRUE)[["value"]], list(1,5), warning="are not all of the same type") + test(1035.0184, melt(DT.list.missing, measure.vars=list(l=c("l1","l2"), n=c("n34","NA5")), na.rm=TRUE), data.table(variable=factor(1:2), l=list(1,2), n=c(3,5))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From 60162ff51bf48b32787c707ef8191c1abc3e0832 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 12 Jun 2021 09:19:49 -0400 Subject: [PATCH 02/13] change na.rm=TRUE example for list column --- man/melt.data.table.Rd | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index ddca733fe8..b31017356b 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -141,8 +141,11 @@ melt(DT, id.vars=1:2, measure.vars=patterns(f="^f_", d="^d_"), value.factor=TRUE # na.rm=TRUE removes rows with NAs in any 'value' columns melt(DT, id.vars=1:2, measure.vars=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) -# return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column -melt(DT, id.vars=1:2, measure.vars=patterns("l_", "c_"), na.rm=TRUE) +# 'na.rm=TRUE' also works with list column, but note that is.na only +# returns TRUE if the list element is a length=1 vector with an NA. +is.na(list(one.NA=NA, two.NA=c(NA,NA))) +melt(DT, id.vars=1:2, measure.vars=patterns("l_", "d_"), na.rm=FALSE) +melt(DT, id.vars=1:2, measure.vars=patterns("l_", "d_"), na.rm=TRUE) # measure list with missing/short entries results in output with runs of NA DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] From e55ac23c9758815d70fef1bd974aafa8e935196d Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 13 Jun 2021 09:01:47 -0400 Subject: [PATCH 03/13] fix for list column in id.vars with na.rm=TRUE --- src/fmelt.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 8c204cb5ce..e24b562e97 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -729,10 +729,21 @@ SEXP getidcols(SEXP DT, SEXP dtnames, Rboolean verbose, struct processData *data } break; case VECSXP : { - for (int j=0; jlmax; ++j) { - for (int k=0; knrow; ++k) { - SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); + if (data->narm) { + for (int j=0; jlmax; ++j) { + SEXP thisidx = VECTOR_ELT(data->naidx, j); + const int *ithisidx = INTEGER(thisidx); + const int thislen = length(thisidx); + for (int k=0; klmax; ++j) { + for (int k=0; knrow; ++k) { + SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); + } + } } } break; From 2a0c17a6c36874b6d389c9068aafe84987335958 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 13 Jun 2021 20:36:20 -0400 Subject: [PATCH 04/13] TODO complex --- src/fmelt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fmelt.c b/src/fmelt.c index e24b562e97..b33d99adb4 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -520,6 +520,7 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int k=0; knrow; ++k) SET_STRING_ELT(target, j*data->nrow + k, STRING_ELT(thiscol, k)); } break; + //TODO complex value type: case CPLXSXP: { } break; case REALSXP : { double *dtarget = REAL(target); const double *dthiscol = REAL(thiscol); From edc55070ace4aa4d6abd760eb52e5e341f7dc834 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 13 Jun 2021 20:43:48 -0400 Subject: [PATCH 05/13] dt_na case VECSXP --- src/frank.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/frank.c b/src/frank.c index 810baf85c5..02c840d88e 100644 --- a/src/frank.c +++ b/src/frank.c @@ -19,7 +19,7 @@ SEXP dt_na(SEXP x, SEXP cols) { for (int i=0; i Date: Sun, 13 Jun 2021 20:44:24 -0400 Subject: [PATCH 06/13] fix na.omit test result --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ff9cea858a..3cc3f3f87c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6518,7 +6518,7 @@ test(1459.12, .Call("CsubsetDT", DT, 5L, seq_along(DT)), setDT(as.data.frame(DT) # Test for na.omit with list, raw and complex types DT = data.table(x=c(1L,1L,NA), y=c(NA, NA, 1), z=as.raw(1:3), w=list(1,NA,2), v=c(1+5i, NA, NA)) -test(1460.1, na.omit(DT, cols="w"), DT) +test(1460.1, na.omit(DT, cols="w"), DT[c(1,3)]) test(1460.2, na.omit(DT, cols="v"), DT[1]) test(1460.3, na.omit(DT, cols=c("v", "y")), DT[0]) test(1460.4, na.omit(DT, cols=c("z", "v")), DT[1]) From bf653d3b3c1cf538993ad195cb336315410445fa Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 13 Jun 2021 20:51:21 -0400 Subject: [PATCH 07/13] test fails for complex NA --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3cc3f3f87c..dd757c5071 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3064,6 +3064,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.0182, melt(DT.list.missing, measure.vars=c("l1","n34"), na.rm=TRUE)[["value"]], list(1,3,4), warning="are not all of the same type") test(1035.0183, melt(DT.list.missing, measure.vars=c("l1","NA5"), na.rm=TRUE)[["value"]], list(1,5), warning="are not all of the same type") test(1035.0184, melt(DT.list.missing, measure.vars=list(l=c("l1","l2"), n=c("n34","NA5")), na.rm=TRUE), data.table(variable=factor(1:2), l=list(1,2), n=c(3,5))) + test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_)), measure.vars="l", na.rm=TRUE)[["value"]], c(NA,NA)) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From 304d0ec63b321f1ac767e46b0b5bb5313b09aaf8 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 13 Jun 2021 20:57:19 -0400 Subject: [PATCH 08/13] dt_na case VECSXP element new case CPLXSXP --- inst/tests/tests.Rraw | 2 +- src/frank.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dd757c5071..38bff0da6c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3064,7 +3064,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.0182, melt(DT.list.missing, measure.vars=c("l1","n34"), na.rm=TRUE)[["value"]], list(1,3,4), warning="are not all of the same type") test(1035.0183, melt(DT.list.missing, measure.vars=c("l1","NA5"), na.rm=TRUE)[["value"]], list(1,5), warning="are not all of the same type") test(1035.0184, melt(DT.list.missing, measure.vars=list(l=c("l1","l2"), n=c("n34","NA5")), na.rm=TRUE), data.table(variable=factor(1:2), l=list(1,2), n=c(3,5))) - test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_)), measure.vars="l", na.rm=TRUE)[["value"]], c(NA,NA)) + test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_)), measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] diff --git a/src/frank.c b/src/frank.c index 02c840d88e..b11bc856c3 100644 --- a/src/frank.c +++ b/src/frank.c @@ -77,8 +77,15 @@ SEXP dt_na(SEXP x, SEXP cols) { ians[j] |= (length(list_element)==1 && STRING_ELT(list_element,0) == NA_STRING); } break; + case CPLXSXP: { + if (length(list_element)==1) { + Rcomplex first_complex = COMPLEX_ELT(list_element, 0); + ians[j] |= (ISNAN(first_complex.r) || ISNAN(first_complex.i)); + } + } + break; case REALSXP: { - if(length(list_element)==1){ + if (length(list_element)==1) { const double first_real = REAL_ELT(list_element,0); if (INHERITS(list_element, char_integer64)) { ians[j] |= (DtoLL(first_real) == NA_INT64_LL); // TODO: can be == NA_INT64_D directly From 97b9a0658618bbeaa650c76215f55807cc7e0805 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 13 Jun 2021 21:17:55 -0400 Subject: [PATCH 09/13] melt on a data table with list columns --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 4e108fae47..e691aeac43 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,6 +133,8 @@ 19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3. +20. `melt` on a data table with list columns for `measure.vars` would silently ignore `na.rm=TRUE`. Now same logic as `is.na()` from base R is used (if list element is scalar NA then it is considered missing and removed). Thanks to Toby Dylan Hocking for the PRs [#4737](https://github.com/Rdatatable/data.table/pull/4737), [#5044](https://github.com/Rdatatable/data.table/issues/5044). + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From ac466a5667a9b5081f71a1bff2788c77c98911a8 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 13 Jun 2021 21:31:54 -0400 Subject: [PATCH 10/13] test removing NA_integer64_ --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 38bff0da6c..e978072cf9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3064,7 +3064,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.0182, melt(DT.list.missing, measure.vars=c("l1","n34"), na.rm=TRUE)[["value"]], list(1,3,4), warning="are not all of the same type") test(1035.0183, melt(DT.list.missing, measure.vars=c("l1","NA5"), na.rm=TRUE)[["value"]], list(1,5), warning="are not all of the same type") test(1035.0184, melt(DT.list.missing, measure.vars=list(l=c("l1","l2"), n=c("n34","NA5")), na.rm=TRUE), data.table(variable=factor(1:2), l=list(1,2), n=c(3,5))) - test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_)), measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) + test(1035.0185, melt(data.table(l=list(c(NA,NA), NA, NA_integer_, NA_real_, NA_complex_, NA_character_, if(test_bit64)NA_integer64_)), measure.vars="l", na.rm=TRUE)[["value"]], list(c(NA,NA))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From c8667afcfbcd78e4da02afc596a3cb6ab8eedb77 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 21 Jun 2021 14:27:48 -0600 Subject: [PATCH 11/13] INTEGER_ELT(x,i) => INTEGER(x)[i] (and similarly REAL_ELT etc) to pass R 3.1 --- src/frank.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/frank.c b/src/frank.c index b11bc856c3..6fe2228d82 100644 --- a/src/frank.c +++ b/src/frank.c @@ -66,11 +66,11 @@ SEXP dt_na(SEXP x, SEXP cols) { SEXP list_element = VECTOR_ELT(v, j); switch (TYPEOF(list_element)) { case LGLSXP: { - ians[j] |= (length(list_element)==1 && LOGICAL_ELT(list_element,0) == NA_LOGICAL); + ians[j] |= (length(list_element)==1 && LOGICAL(list_element)[0] == NA_LOGICAL); } break; case INTSXP: { - ians[j] |= (length(list_element)==1 && INTEGER_ELT(list_element,0) == NA_INTEGER); + ians[j] |= (length(list_element)==1 && INTEGER(list_element)[0] == NA_INTEGER); } break; case STRSXP: { @@ -79,14 +79,14 @@ SEXP dt_na(SEXP x, SEXP cols) { break; case CPLXSXP: { if (length(list_element)==1) { - Rcomplex first_complex = COMPLEX_ELT(list_element, 0); + Rcomplex first_complex = COMPLEX(list_element)[0]; ians[j] |= (ISNAN(first_complex.r) || ISNAN(first_complex.i)); } } break; case REALSXP: { if (length(list_element)==1) { - const double first_real = REAL_ELT(list_element,0); + const double first_real = REAL(list_element)[0]; if (INHERITS(list_element, char_integer64)) { ians[j] |= (DtoLL(first_real) == NA_INT64_LL); // TODO: can be == NA_INT64_D directly } else { From 43dbaaaf0527e95462b68016e56a88d07f6c97bf Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 21 Jun 2021 14:40:28 -0600 Subject: [PATCH 12/13] avoid DtoLL --- src/frank.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/frank.c b/src/frank.c index 6fe2228d82..bbcf8bc993 100644 --- a/src/frank.c +++ b/src/frank.c @@ -41,8 +41,7 @@ SEXP dt_na(SEXP x, SEXP cols) { case REALSXP: { const double *dv = REAL(v); if (INHERITS(v, char_integer64)) { - for (int j=0; j Date: Mon, 21 Jun 2021 14:49:01 -0600 Subject: [PATCH 13/13] more on int64 --- src/frank.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/frank.c b/src/frank.c index bbcf8bc993..2e9e14bcf1 100644 --- a/src/frank.c +++ b/src/frank.c @@ -39,11 +39,11 @@ SEXP dt_na(SEXP x, SEXP cols) { } break; case REALSXP: { - const double *dv = REAL(v); if (INHERITS(v, char_integer64)) { - for (int j=0; j