From cadfd099a547075b9ffb6503550e8ac120076b51 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Sep 2020 22:48:25 -0700 Subject: [PATCH 01/13] support missing values in measure.vars arg to melt --- inst/tests/tests.Rraw | 5 +++ src/fmelt.c | 93 ++++++++++++++++++++++++++++++------------- 2 files changed, 71 insertions(+), 27 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 058c7db3b2..e36e3752f2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17138,3 +17138,8 @@ test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't chan test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) +# fix for #4027 via PR#TODO. +DT.wide = data.table(a2=2, b1=1, b2=2) +expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) +test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) +test(2154.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected) diff --git a/src/fmelt.c b/src/fmelt.c index 22a4ac1fc5..6794a461cb 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -97,6 +97,21 @@ static const char *concat(SEXP vec, SEXP idx) { return ans; } +// input: character vector of column names (maybe missing), output: +// integer vector of column indices with NA_INTEGER in the positions +// with missing inputs. +SEXP chmatch_na(SEXP x, SEXP table){ + SEXP ans; + PROTECT(ans = chmatch(x, table, 0)); + for(int i=0; i ncol) + if (INTEGER(tmp)[i] != NA_INTEGER && (INTEGER(tmp)[i] <= 0 || INTEGER(tmp)[i] > ncol)){ error(_("One or more values in 'measure.vars' is invalid.")); + } else if (!LOGICAL(booltmp)[i]) targetcols++; else continue; } @@ -260,18 +276,27 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } ans = PROTECT(allocVector(VECSXP, 2)); protecti++; SET_VECTOR_ELT(ans, 0, idcols); - SET_VECTOR_ELT(ans, 1, valuecols); + SET_VECTOR_ELT(ans, 1, valuecols);//List of integer vectors. UNPROTECT(protecti); return(ans); } struct processData { SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 - SEXP idcols, valuecols, naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively - int lids, lvalues, lmax, lmin, totlen, nrow; - int *isfactor, *leach, *isidentical; + SEXP idcols, + valuecols, // list with one element per output/value column, each + // element is an integer vector. + naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively + int *isfactor, + *leach, // length of each element of the valuecols(measure.vars) list. + *isidentical; // are all inputs for this value column the same type? + int lids, // number of id columns. + lvalues, // number of value columns. + lmax, //max length of valuecols elements / number of times to repeat ids. + totlen, // of output/long DT result of melt operation. + nrow; // of input/wide DT to be melted. SEXPTYPE *maxtype; - Rboolean narm; + Rboolean narm; // remove missing values? }; static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) { @@ -279,7 +304,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SEXP vars,tmp,thiscol; SEXPTYPE type; int i,j; - data->lmax = 0; data->lmin = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); + data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); @@ -296,29 +321,36 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); - for (i=0; ilvalues; i++) { + // first find max type of each output column. + for (i=0; ilvalues; i++) { // for each output column. tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; - data->lmin = (data->lmin < data->leach[i]) ? data->lmin : data->leach[i]; - for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if (isFactor(thiscol)) { - data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; - data->maxtype[i] = STRSXP; - } else { - type = TYPEOF(thiscol); - if (type > data->maxtype[i]) data->maxtype[i] = type; + for (j=0; jleach[i]; j++) { // for each input column. + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if (isFactor(thiscol)) { + data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; + data->maxtype[i] = STRSXP; + } else { + type = TYPEOF(thiscol); + if (type > data->maxtype[i]) data->maxtype[i] = type; + } } } for (j=0; jleach[i]; j++) { - thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); - if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { - data->isidentical[i] = 0; - break; + int this_col_num = INTEGER(tmp)[j]; + if(this_col_num != NA_INTEGER){ + thiscol = VECTOR_ELT(DT, this_col_num-1); + if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || + (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { + data->isidentical[i] = 0; + break; + } } } } @@ -427,18 +459,25 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s SEXP flevels = PROTECT(allocVector(VECSXP, data->lmax)); Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean)); SEXP ansvals = PROTECT(allocVector(VECSXP, data->lvalues)); - for (int i=0; ilvalues; ++i) { + for (int i=0; ilvalues; ++i) {//for each output/value column. bool thisvalfactor = (data->maxtype[i] == VECSXP) ? false : valfactor; SEXP target = PROTECT(allocVector(data->maxtype[i], data->totlen)); // to keep rchk happy SET_VECTOR_ELT(ansvals, i, target); UNPROTECT(1); // still protected by virtue of being member of protected ansval. - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); // integer vector of column ids. int counter = 0; bool copyattr = false; - for (int j=0; jlmax; ++j) { + for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - SEXP thiscol = (j < data->leach[i]) ? VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1) - : allocNAVector(data->maxtype[i], data->nrow); + // TODO use this line of code if NA specified. + SEXP thiscol; + int input_column_num = INTEGER(thisvaluecols)[j]; + if (j >= data->leach[i] || // fewer indices than the max were specified. + input_column_num == NA_INTEGER) { // NA was specified. + thiscol = allocNAVector(data->maxtype[i], data->nrow); + }else{ + thiscol = VECTOR_ELT(DT, input_column_num-1); + } if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); copyattr = true; From 522cc789b6c26e22006b2baf36a4a84bd9db7788 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 25 Sep 2020 23:12:40 -0700 Subject: [PATCH 02/13] PR4720 --- NEWS.md | 7 +++++++ inst/tests/tests.Rraw | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 308d427250..3e6668e135 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,13 @@ ## NEW FEATURES +1. `melt.data.table()` now supports `NA` entries when specifying a + list of `measure.vars`, which translate into runs of missing values + in the output. Fixes + [#4027](https://github.com/Rdatatable/data.table/issues/4027) via + [PR#4720](https://github.com/Rdatatable/data.table/pull/4720) from + @tdhock. + ## BUG FIXES 1. `test.data.table()` could fail the 2nd time it is run by a user in the same R session on Windows due to not resetting locale properly after testing Chinese translation, [#4630](https://github.com/Rdatatable/data.table/pull/4630). Thanks to Cole Miller for investigating and fixing. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e36e3752f2..4d059fc772 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17138,7 +17138,7 @@ test(2153.4, address(ans$V1[[1L]]), address(ans$V1[[2L]])) # .NGRP doesn't chan test(2153.5, DT[, .(list(c(0L,.N,0L))), by=x], # c() here will create new object so this is ok anyway; i.e. address(.N) is not present in j's result data.table(x=1:2, V1=list(c(0L,1L,0L), c(0L,2L,0L)))) -# fix for #4027 via PR#TODO. +# fix for #4027 via PR#4720. DT.wide = data.table(a2=2, b1=1, b2=2) expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) From 264169e73d71eda55eefd8790065fa741a7da4c9 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sat, 26 Sep 2020 07:49:19 -0700 Subject: [PATCH 03/13] test/fix with id.vars --- inst/tests/tests.Rraw | 4 ++++ src/fmelt.c | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4d059fc772..73da3f0726 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17143,3 +17143,7 @@ DT.wide = data.table(a2=2, b1=1, b2=2) expected = data.table(variable=factor(1:2), a=c(NA,2), b=c(1,2)) test(2154.1, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3)), expected) test(2154.2, melt(DT.wide, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2"))), expected) +DTid <- data.table(DT.wide, id=1) +exid <- data.table(id=1, expected) +test(2154.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) +test(2154.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) diff --git a/src/fmelt.c b/src/fmelt.c index 6794a461cb..bb5c5f3554 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -153,6 +153,10 @@ static SEXP unlist_(SEXP xint) { return(ans); } +bool invalid_measure(int i, int ncol) { + return i != NA_INTEGER && (i <= 0 || i > ncol); +} + SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans; @@ -218,9 +222,8 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } booltmp = PROTECT(duplicated(tmp, FALSE)); protecti++; for (i=0; i ncol)){ + if (invalid_measure(INTEGER(tmp)[i], ncol)) error(_("One or more values in 'measure.vars' is invalid.")); - } else if (!LOGICAL(booltmp)[i]) targetcols++; else continue; } @@ -265,7 +268,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { tmp = PROTECT(unlist_(tmp2)); protecti++; } for (i=0; i ncol) + if (invalid_measure(INTEGER(tmp)[i], ncol)) error(_("One or more values in 'measure.vars' is invalid.")); } if (isNewList(measure)) valuecols = tmp2; From 9a58c4ce9927b94755f7af3712213ac87c52be48 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 13:25:16 -0700 Subject: [PATCH 04/13] only link issue not PR in NEWS --- NEWS.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3e6668e135..995565d172 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,10 +8,11 @@ 1. `melt.data.table()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values - in the output. Fixes - [#4027](https://github.com/Rdatatable/data.table/issues/4027) via - [PR#4720](https://github.com/Rdatatable/data.table/pull/4720) from - @tdhock. + in the output. Useful for melting wide data tables with some + missing columns, + [#4027](https://github.com/Rdatatable/data.table/issues/4027). + Thanks to @vspinu for reporting, and @tdhock for implementing the + changes to fmelt. ## BUG FIXES From 0b69386aeadf0ab0cc02063dfdef7f4f8245b36f Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 13:26:04 -0700 Subject: [PATCH 05/13] doc/exmple for missing entries of measure.vars list --- man/melt.data.table.Rd | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index a9d69b5f66..a9403a4281 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -64,7 +64,11 @@ effect. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt -into multiple columns in a single function call efficiently. The function +into multiple columns in a single function call efficiently. +If a vector in the list containts missing values, or is shorter than the +max length of the list elements, then the output will include runs of +missing values at the specified position, or at the end. +The function \code{\link{patterns}} can be used to provide regular expression patterns. When used along with \code{melt}, if \code{cols} argument is not provided, the patterns will be matched against \code{names(data)}, for convenience. @@ -134,6 +138,10 @@ melt(DT, id=1:2, measure=patterns("f_", "d_"), value.factor=TRUE, na.rm=TRUE) # return 'NA' for missing columns, 'na.rm=TRUE' ignored due to list column melt(DT, id=1:2, measure=patterns("l_", "c_"), na.rm=TRUE) +# measure list with missing/short entries results in output with runs of NA +DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)] +melt(DT.missing.cols, measure=list(d=1:2, c="c_1", f=c(NA, "f_2"))) + } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} From 2a242e01a13b58612fa672eb9da940337b1e343d Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 22:14:02 -0700 Subject: [PATCH 06/13] no newlines in news items --- NEWS.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 995565d172..6c189a2888 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,13 +6,7 @@ ## NEW FEATURES -1. `melt.data.table()` now supports `NA` entries when specifying a - list of `measure.vars`, which translate into runs of missing values - in the output. Useful for melting wide data tables with some - missing columns, - [#4027](https://github.com/Rdatatable/data.table/issues/4027). - Thanks to @vspinu for reporting, and @tdhock for implementing the - changes to fmelt. +1. `melt.data.table()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data tables with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing the changes to fmelt. ## BUG FIXES From 50b7b2eca82833bff447fce9df7a2977823b6dc5 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 27 Sep 2020 22:16:57 -0700 Subject: [PATCH 07/13] fix typo --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index a9403a4281..7f3c742db2 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -65,7 +65,7 @@ effect. From version \code{1.9.6}, \code{melt} gains a feature with \code{measure.vars} accepting a list of \code{character} or \code{integer} vectors as well to melt into multiple columns in a single function call efficiently. -If a vector in the list containts missing values, or is shorter than the +If a vector in the list contains missing values, or is shorter than the max length of the list elements, then the output will include runs of missing values at the specified position, or at the end. The function From 6c8bb5158d2786913e9338670f6fbac08c3a86c3 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 28 Sep 2020 11:46:07 -0700 Subject: [PATCH 08/13] remove tabs --- src/fmelt.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index bb5c5f3554..403c1989b3 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -288,7 +288,7 @@ struct processData { SEXP RCHK; // a 2 item list holding vars (result of checkVars) and naidx. PROTECTed up in fmelt so that preprocess() doesn't need to PROTECT. To pass rchk, #2865 SEXP idcols, valuecols, // list with one element per output/value column, each - // element is an integer vector. + // element is an integer vector. naidx; // convenience pointers into RCHK[0][0], RCHK[0][1] and RCHK[1] respectively int *isfactor, *leach, // length of each element of the valuecols(measure.vars) list. @@ -335,25 +335,25 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna for (j=0; jleach[i]; j++) { // for each input column. int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ - thiscol = VECTOR_ELT(DT, this_col_num-1); - if (isFactor(thiscol)) { - data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; - data->maxtype[i] = STRSXP; - } else { - type = TYPEOF(thiscol); - if (type > data->maxtype[i]) data->maxtype[i] = type; - } + thiscol = VECTOR_ELT(DT, this_col_num-1); + if (isFactor(thiscol)) { + data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; + data->maxtype[i] = STRSXP; + } else { + type = TYPEOF(thiscol); + if (type > data->maxtype[i]) data->maxtype[i] = type; + } } } for (j=0; jleach[i]; j++) { int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ - thiscol = VECTOR_ELT(DT, this_col_num-1); - if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || - (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { - data->isidentical[i] = 0; - break; - } + thiscol = VECTOR_ELT(DT, this_col_num-1); + if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || + (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { + data->isidentical[i] = 0; + break; + } } } } @@ -472,14 +472,13 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s bool copyattr = false; for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - // TODO use this line of code if NA specified. SEXP thiscol; int input_column_num = INTEGER(thisvaluecols)[j]; if (j >= data->leach[i] || // fewer indices than the max were specified. - input_column_num == NA_INTEGER) { // NA was specified. - thiscol = allocNAVector(data->maxtype[i], data->nrow); + input_column_num == NA_INTEGER) { // NA was specified. + thiscol = allocNAVector(data->maxtype[i], data->nrow); }else{ - thiscol = VECTOR_ELT(DT, input_column_num-1); + thiscol = VECTOR_ELT(DT, input_column_num-1); } if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); From 0bf632356443ae386641b8eeb1dd0ae411b79e35 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Oct 2020 17:08:26 -0700 Subject: [PATCH 09/13] fix segfault with na.rm=T --- inst/tests/tests.Rraw | 1 + src/fmelt.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index db14c662e2..9b9cdc3e91 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17159,3 +17159,4 @@ DTid <- data.table(DT.wide, id=1) exid <- data.table(id=1, expected) test(2155.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2155.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) +test(2155.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE)[, .(a, b)], data.table(a=2, b=2))#not testing variable because it is not computed correctly, #4455 diff --git a/src/fmelt.c b/src/fmelt.c index 403c1989b3..d018a86d4e 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -442,11 +442,13 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int i=0; ilmax; ++i) { SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); for (int j=0; jlvalues; ++j) { - if (i < data->leach[j]) { - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); - SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, INTEGER(thisvaluecols)[i]-1)); - } else { + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); + int input_column_num = INTEGER(thisvaluecols)[i]; + if (j >= data->leach[j] || //fewer indices than the max were specified. + input_column_num == NA_INTEGER) { //NA was specified. SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); + } else { + SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, input_column_num-1)); } } tmp = PROTECT(dt_na(tmp, seqcols)); From 5725b25b37416b104f573dc2dc0b50b267f15220 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 08:20:48 -0700 Subject: [PATCH 10/13] test with narm=TRUE that caused a segfault --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9b9cdc3e91..d3766eee7d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3014,6 +3014,9 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) error="Unknown 'id.vars' type raw") test(1035.012, melt(DT, id.vars=1:3, measure.vars=as.raw(0)), error="Unknown 'measure.vars' type raw") + test(1035.013, melt(data.table(a=1, b=1), id.vars=c(1,1)), data.table(a=1, a.1=1, variable=factor("b"), value=1)) + test(1035.014, melt(data.table(a1=1, b1=1, b2=2), na.rm=TRUE, measure.vars=list(a="a1", b=c("b1","b2"))), data.table(variable=factor(1,c("1","2")), a=1, b=1)) + test(1035.015, melt(data.table(a=1+2i, b=1), id.vars="a"), error="Unknown column type 'complex' for column 'a' in 'data'") ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] From 854f18973c3dbfd44f20ba05806f63ea7a6c5dc4 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Oct 2020 08:21:14 -0700 Subject: [PATCH 11/13] fix segfault via new fun input_col_or_na --- src/fmelt.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index d018a86d4e..e64ebba32d 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -427,6 +427,16 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType return ans; } +SEXP input_col_or_na(SEXP DT, struct processData* data, SEXP thisvaluecols, int out_col, int in_col) { + if (in_col < data->leach[out_col]) { + int input_column_num = INTEGER(thisvaluecols)[in_col]; + if (input_column_num != NA_INTEGER) { + return VECTOR_ELT(DT, input_column_num-1); + } + } + return allocNAVector(data->maxtype[out_col], data->nrow); +} + SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { for (int i=0; ilvalues; ++i) { SEXP thisvaluecols = VECTOR_ELT(data->valuecols, i); @@ -442,14 +452,8 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s for (int i=0; ilmax; ++i) { SEXP tmp = PROTECT(allocVector(VECSXP, data->lvalues)); for (int j=0; jlvalues; ++j) { - SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); - int input_column_num = INTEGER(thisvaluecols)[i]; - if (j >= data->leach[j] || //fewer indices than the max were specified. - input_column_num == NA_INTEGER) { //NA was specified. - SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); - } else { - SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, input_column_num-1)); - } + SEXP thisvaluecols = VECTOR_ELT(data->valuecols, j); + SET_VECTOR_ELT(tmp, j, input_col_or_na(DT, data, thisvaluecols, j, i)); } tmp = PROTECT(dt_na(tmp, seqcols)); SEXP w; @@ -474,14 +478,7 @@ SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, s bool copyattr = false; for (int j=0; jlmax; ++j) {// for each input column. int thisprotecti = 0; - SEXP thiscol; - int input_column_num = INTEGER(thisvaluecols)[j]; - if (j >= data->leach[i] || // fewer indices than the max were specified. - input_column_num == NA_INTEGER) { // NA was specified. - thiscol = allocNAVector(data->maxtype[i], data->nrow); - }else{ - thiscol = VECTOR_ELT(DT, input_column_num-1); - } + SEXP thiscol = input_col_or_na(DT, data, thisvaluecols, i, j); if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); copyattr = true; From 02180c0f8c4463e6cd1b73871fc5e0105c1d6635 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 8 May 2021 22:22:35 -0600 Subject: [PATCH 12/13] pass NA_INTEGER to chmatch instead of creating chmatch_na --- src/chmatch.c | 9 ++++++--- src/fmelt.c | 17 +---------------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/src/chmatch.c b/src/chmatch.c index f80e7dd2c7..a091e646f0 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -74,11 +74,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch } int nuniq=0; for (int i=0; i0) { savetl(s); tl=0; } if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table } + // in future if we need NAs in x not to be matched to NAs in table ... + // if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0) + // SET_TRUELENGTH(NA_STRING, 0); if (chmatchdup) { // chmatchdup() is basically base::pmatch() but without the partial matching part. For example : // chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table' @@ -107,7 +110,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch for (int i=0; i Date: Sat, 8 May 2021 23:40:03 -0600 Subject: [PATCH 13/13] whitespace, and unrelated to this PR localized more i,j in loops (to protect from i,j being used accidentally outside those loops, not really for speed reasons) --- src/fmelt.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index d320e3c755..5357f905c0 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -139,22 +139,22 @@ static SEXP unlist_(SEXP xint) { } bool invalid_measure(int i, int ncol) { - return i != NA_INTEGER && (i <= 0 || i > ncol); + return (i<=0 && i!=NA_INTEGER) || i>ncol; } SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { - int i, ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; + int ncol=LENGTH(DT), targetcols=0, protecti=0, u=0, v=0; SEXP thiscol, idcols = R_NilValue, valuecols = R_NilValue, tmp, tmp2, booltmp, unqtmp, ans; SEXP dtnames = PROTECT(getAttrib(DT, R_NamesSymbol)); protecti++; if (isNull(id) && isNull(measure)) { - for (i=0; i ncol) error(_("One or more values in 'id.vars' is invalid.")); else if (!LOGICAL(booltmp)[i]) targetcols++; @@ -180,7 +180,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { } unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++; u = 0; - for (i=0; i ncol) error(_("One or more values in 'id.vars' is invalid.")); } @@ -252,7 +252,7 @@ SEXP checkVars(SEXP DT, SEXP id, SEXP measure, Rboolean verbose) { if (isNewList(measure)) { tmp = PROTECT(unlist_(tmp2)); protecti++; } - for (i=0; ilmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); data->idcols = VECTOR_ELT(vars, 0); @@ -310,14 +308,14 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); // first find max type of each output column. - for (i=0; ilvalues; i++) { // for each output column. + for (int i=0; ilvalues; ++i) { // for each output column. tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; - for (j=0; jleach[i]; j++) { // for each input column. + for (int j=0; jleach[i]; ++j) { // for each input column. int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ thiscol = VECTOR_ELT(DT, this_col_num-1); @@ -330,7 +328,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna } } } - for (j=0; jleach[i]; j++) { + for (int j=0; jleach[i]; ++j) { int this_col_num = INTEGER(tmp)[j]; if(this_col_num != NA_INTEGER){ thiscol = VECTOR_ELT(DT, this_col_num-1);