From 0a640f209403e2d4972194148deeccadda33143a Mon Sep 17 00:00:00 2001 From: Toby Hocking Date: Wed, 31 Jul 2024 22:35:30 -0400 Subject: [PATCH 1/5] warn about future change --- NEWS.md | 4 ++-- inst/tests/tests.Rraw | 8 ++++---- src/fmelt.c | 7 +++++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 83f48d685f..dd6ce61ec7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,8 +70,6 @@ 6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. -7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. Any users who were relying on this behavior can change `measure.vars=list("col_name")` (output `variable` was column name, now is column index/integer) to `measure.vars="col_name"` (`variable` still is column name). - 8. Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. 9. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. @@ -94,6 +92,8 @@ ## NOTES +7. `melt` is documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length=1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change), and there is a new warning. + 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. 2. The documentation for the `fill` argument in `rbind()` and `rbindlist()` now notes the expected behaviour for missing `list` columns when `fill=TRUE`, namely to use `NULL` (not `NA`), [#4198](https://github.com/Rdatatable/data.table/pull/4198). Thanks @sritchie73 for the proposal and fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b3447b04ed..57a348ce15 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17257,13 +17257,13 @@ exid = data.table(id=1, expected) test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) -test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("1","2")), b=c(1,2))) # measure.vars named list length=1, #5065 +test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("b1","b2")), b=c(1,2)), warning="measure.vars is a list with length=1") # measure.vars named list length=1, #5065 # consistency between measure.vars=list with length=1 and length>1, #5209 -test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor(1), value=2)) +test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2), warning="measure.vars is a list with length=1") test(2182.72, melt(DT.wide, measure.vars=c("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) -test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="1", value=2)) +test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2), warning="measure.vars is a list with length=1") test(2182.74, melt(DT.wide, measure.vars=c("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2)) -test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="1", n=10))#thanks @mnazarov +test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="a", n=10), warning="measure.vars is a list with length=1")#thanks @mnazarov ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues diff --git a/src/fmelt.c b/src/fmelt.c index 51f4fbbb8d..365c29b878 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -595,9 +595,12 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov if (isNull(data->variable_table)) { + if (data->lvalues == 1 & data->measure_is_list) { + warning("measure.vars is a list with length=1, which according to documentation should return integer indices in the variable column, but currently returns character column names. To increase consistency in the next release, we plan to change variable to integer, so users who were relying on this behavior should change measure.vars=list('col_name') (output variable is column name, will be column index/integer) to measure.vars='col_name' (variable is column name before and after the planned change)."); + } if (!varfactor) { SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO change to !data->measure_is_list const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; @@ -616,7 +619,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO change to !data->measure_is_list SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++; From a91541fceab3f89801e10377f5ae0af59881eea3 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Aug 2024 09:11:53 -0400 Subject: [PATCH 2/5] add parens Co-authored-by: Michael Chirico --- src/fmelt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fmelt.c b/src/fmelt.c index 365c29b878..6349041fc6 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -595,7 +595,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov if (isNull(data->variable_table)) { - if (data->lvalues == 1 & data->measure_is_list) { + if ((data->lvalues == 1) & data->measure_is_list) { warning("measure.vars is a list with length=1, which according to documentation should return integer indices in the variable column, but currently returns character column names. To increase consistency in the next release, we plan to change variable to integer, so users who were relying on this behavior should change measure.vars=list('col_name') (output variable is column name, will be column index/integer) to measure.vars='col_name' (variable is column name before and after the planned change)."); } if (!varfactor) { From f6ed89ef0fbdc3e9e47f4ca79a894f4098ebeb5c Mon Sep 17 00:00:00 2001 From: Toby Hocking Date: Thu, 1 Aug 2024 09:13:29 -0400 Subject: [PATCH 3/5] link TODO issue 5247 --- src/fmelt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 365c29b878..bb51f728a3 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -600,7 +600,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str } if (!varfactor) { SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (data->lvalues == 1) {//one value column to output. TODO change to !data->measure_is_list + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; @@ -619,7 +619,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (data->lvalues == 1) {//one value column to output. TODO change to !data->measure_is_list + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++; From b5277d732798f8eda6ed6f16c9677bc35fd0f254 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Aug 2024 09:15:20 -0400 Subject: [PATCH 4/5] Update NEWS.md Co-authored-by: Michael Chirico --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index dd6ce61ec7..fdc0db931b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -92,7 +92,7 @@ ## NOTES -7. `melt` is documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length=1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change), and there is a new warning. +7. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. From e5b563f147cb9985056bba2bc59d881bd73b0c65 Mon Sep 17 00:00:00 2001 From: Toby Hocking Date: Thu, 1 Aug 2024 09:47:57 -0400 Subject: [PATCH 5/5] clarify warning --- src/fmelt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fmelt.c b/src/fmelt.c index 6722e952ff..77a48f6a43 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -596,7 +596,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov if (isNull(data->variable_table)) { if ((data->lvalues == 1) & data->measure_is_list) { - warning("measure.vars is a list with length=1, which according to documentation should return integer indices in the variable column, but currently returns character column names. To increase consistency in the next release, we plan to change variable to integer, so users who were relying on this behavior should change measure.vars=list('col_name') (output variable is column name, will be column index/integer) to measure.vars='col_name' (variable is column name before and after the planned change)."); + warning("measure.vars is a list with length=1, which as long documented should return integer indices in the 'variable' column, but currently returns character column names. To increase consistency in the next release, we plan to change 'variable' to integer, so users who were relying on this behavior should change measure.vars=list('col_name') (output variable is column name now, but will become column index/integer) to measure.vars='col_name' (variable is column name before and after the planned change)."); } if (!varfactor) { SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen));