From aaadbe16b2e0439d01fce162e5a57ab3d46c9a6b Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 3 Nov 2019 12:53:20 +0800 Subject: [PATCH 1/8] nafill gains nan_is_na argument --- NEWS.md | 2 ++ R/shift.R | 8 ++++---- inst/tests/tests.Rraw | 10 ++++++++++ man/nafill.Rd | 7 ++++--- src/data.table.h | 4 ++-- src/nafill.c | 15 +++++++++------ 6 files changed, 31 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index adc1c48362..62d498b379 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## NEW FEATURES +1. `nafill` and `setnafill` gain `nan_is_na` argument (default `TRUE`) to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Thanks @AnonymousBoba for the suggestion. + ## BUG FIXES ## NOTES diff --git a/R/shift.R b/R/shift.R index 125bf41cef..99c1d2d56b 100644 --- a/R/shift.R +++ b/R/shift.R @@ -24,16 +24,16 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA ans } -nafill = function(x, type=c("const","locf","nocb"), fill=NA, verbose=getOption("datatable.verbose")) { +nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, verbose=getOption("datatable.verbose")) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - .Call(CnafillR, x, type, fill, FALSE, NULL, verbose) + .Call(CnafillR, x, type, fill, nan_is_na, FALSE, NULL, verbose) } -setnafill = function(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) { +setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, cols=seq_along(x), verbose=getOption("datatable.verbose")) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - invisible(.Call(CnafillR, x, type, fill, TRUE, cols, verbose)) + invisible(.Call(CnafillR, x, type, fill, nan_is_na, TRUE, cols, verbose)) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 81d9fcaebd..c9f528ee8a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16370,6 +16370,16 @@ test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L)) # wrapping wi test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]], # different result with the NA data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L))) +# nan_is_na to treat NaN as NA in nafill, #4020 +x = c(NA, NaN, 0, 1, 2) +ans1 = c(0, 0, 0:2) +ans2 = c(0, NaN, 0:2) +DT = data.table(a=x, b=x) +test(2121.1, nafill(x, fill=0), ans1) +test(2121.2, nafill(x, fill=0, nan_is_na=FALSE), ans2) +test(2121.3, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) +test(2121.4, setnafill(DT, fill=0, nan_is_na=FALSE), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) + ################################### # Add new tests above this line # diff --git a/man/nafill.Rd b/man/nafill.Rd index 8d7615e8bb..d4fb3dcd5f 100644 --- a/man/nafill.Rd +++ b/man/nafill.Rd @@ -10,15 +10,16 @@ Fast fill missing values using constant value, \emph{last observation carried forward} or \emph{next observation carried backward}. } \usage{ -nafill(x, type=c("const","locf","nocb"), fill=NA, +nafill(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, verbose=getOption("datatable.verbose")) -setnafill(x, type=c("const","locf","nocb"), fill=NA, cols=seq_along(x), - verbose=getOption("datatable.verbose")) +setnafill(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, + cols=seq_along(x), verbose=getOption("datatable.verbose")) } \arguments{ \item{x}{ vector, list, data.frame or data.table of numeric columns. } \item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. } \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. } + \item{nan_is_na}{ \code{logical}; should \code{NaN} be treated as \code{NA} during replacement? } \item{cols}{ numeric or character vector specifying columns to be updated. } \item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. } } diff --git a/src/data.table.h b/src/data.table.h index 1f5c9cd5ff..3b9cd9cde2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -200,9 +200,9 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho); // nafill.c -void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose); +void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose); void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose); -SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose); +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols, SEXP verbose); // between.c SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP check); diff --git a/src/nafill.c b/src/nafill.c index 841d9f6fb1..98bf0d4975 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -1,22 +1,22 @@ #include "data.table.h" -void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose) { +void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose) { double tic=0.0; if (verbose) tic = omp_get_wtime(); if (type==0) { // const for (uint_fast64_t i=0; idbl_v[i] = ISNA(x[i]) ? fill : x[i]; + ans->dbl_v[i] = nan_is_na ? (ISNAN(x[i]) ? fill : x[i]) : (ISNA(x[i]) ? fill : x[i]); } } else if (type==1) { // locf ans->dbl_v[0] = x[0]; for (uint_fast64_t i=1; idbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]; + ans->dbl_v[i] = nan_is_na ? (ISNAN(x[i]) ? ans->dbl_v[i-1] : x[i]) : (ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]); } } else if (type==2) { // nocb ans->dbl_v[nx-1] = x[nx-1]; for (int_fast64_t i=nx-2; i>=0; i--) { - ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]; + ans->dbl_v[i] = nan_is_na ? (ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]) : (ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]); } } if (verbose) @@ -67,7 +67,7 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi snprintf(ans->message[0], 500, "%s: took %.3fs\n", __func__, omp_get_wtime()-tic); } -SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbose) { +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols, SEXP verbose) { int protecti=0; if (!IS_TRUE_OR_FALSE(verbose)) error("verbose must be TRUE or FALSE"); @@ -152,7 +152,10 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP inplace, SEXP cols, SEXP verbo if (INHERITS(this_x, char_integer64) || INHERITS(this_x, char_nanotime)) { // inside parallel region so can't call Rinherits() nafillInteger64(i64x[i], inx[i], itype, i64fill, &vans[i], bverbose); } else { - nafillDouble(dx[i], inx[i], itype, dfill, &vans[i], bverbose); + if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) + error("nan_is_na must be TRUE or FALSE"); + bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; + nafillDouble(dx[i], inx[i], itype, dfill, nan_is_na, &vans[i], bverbose); } } break; case INTSXP : { From 2d9e4e937c964a07a2669a343e7142d76bf5c796 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 3 Nov 2019 20:21:29 +0800 Subject: [PATCH 2/8] change to argument nan --- NEWS.md | 2 +- R/shift.R | 8 ++++---- R/utils.R | 7 +++++++ inst/tests/tests.Rraw | 4 ++-- man/nafill.Rd | 6 +++--- src/nafill.c | 30 ++++++++++++++++++++++++------ 6 files changed, 41 insertions(+), 16 deletions(-) diff --git a/NEWS.md b/NEWS.md index 62d498b379..70e66dbc3b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ ## NEW FEATURES -1. `nafill` and `setnafill` gain `nan_is_na` argument (default `TRUE`) to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Thanks @AnonymousBoba for the suggestion. +1. `nafill` and `setnafill` gain `nan` argument (default `NA`, alternative `NaN`) to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Thanks @AnonymousBoba for the suggestion. ## BUG FIXES diff --git a/R/shift.R b/R/shift.R index 99c1d2d56b..0299af0ff8 100644 --- a/R/shift.R +++ b/R/shift.R @@ -24,16 +24,16 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA ans } -nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, verbose=getOption("datatable.verbose")) { +nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, verbose=getOption("datatable.verbose")) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - .Call(CnafillR, x, type, fill, nan_is_na, FALSE, NULL, verbose) + .Call(CnafillR, x, type, fill, nan_is_na(nan), FALSE, NULL, verbose) } -setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, cols=seq_along(x), verbose=getOption("datatable.verbose")) { +setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - invisible(.Call(CnafillR, x, type, fill, nan_is_na, TRUE, cols, verbose)) + invisible(.Call(CnafillR, x, type, fill, nan_is_na(nan), TRUE, cols, verbose)) } diff --git a/R/utils.R b/R/utils.R index de35e709c0..06a62f2039 100644 --- a/R/utils.R +++ b/R/utils.R @@ -13,6 +13,13 @@ if (base::getRversion() < "3.5.0") { isTRUEorNA = function(x) is.logical(x) && length(x)==1L && (is.na(x) || x) isTRUEorFALSE = function(x) is.logical(x) && length(x)==1L && !is.na(x) allNA = function(x) .Call(C_allNAR, x) +# helper for nan argument (e.g. nafill): TRUE -> treat NaN as NA +nan_is_na = function(x) { + if (length(x) > 1L) stop("Argument 'nan' must be length 1") + if (identical(x, NA)) return(TRUE) + if (identical(x, NaN)) return(FALSE) + stop("Argument 'nan' must be NA (_not_ NA_real_) or NaN") +} if (base::getRversion() < "3.2.0") { # Apr 2015 isNamespaceLoaded = function(x) x %chin% loadedNamespaces() diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c9f528ee8a..d9df73872d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16376,9 +16376,9 @@ ans1 = c(0, 0, 0:2) ans2 = c(0, NaN, 0:2) DT = data.table(a=x, b=x) test(2121.1, nafill(x, fill=0), ans1) -test(2121.2, nafill(x, fill=0, nan_is_na=FALSE), ans2) +test(2121.2, nafill(x, fill=0, nan=NaN), ans2) test(2121.3, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) -test(2121.4, setnafill(DT, fill=0, nan_is_na=FALSE), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) +test(2121.4, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) ################################### diff --git a/man/nafill.Rd b/man/nafill.Rd index d4fb3dcd5f..00a71d3393 100644 --- a/man/nafill.Rd +++ b/man/nafill.Rd @@ -10,16 +10,16 @@ Fast fill missing values using constant value, \emph{last observation carried forward} or \emph{next observation carried backward}. } \usage{ -nafill(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, +nafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, verbose=getOption("datatable.verbose")) -setnafill(x, type=c("const","locf","nocb"), fill=NA, nan_is_na=TRUE, +setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) } \arguments{ \item{x}{ vector, list, data.frame or data.table of numeric columns. } \item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. } \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. } - \item{nan_is_na}{ \code{logical}; should \code{NaN} be treated as \code{NA} during replacement? } + \item{nan}{ Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? } \item{cols}{ numeric or character vector specifying columns to be updated. } \item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. } } diff --git a/src/nafill.c b/src/nafill.c index 98bf0d4975..bc1c99a743 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -5,18 +5,36 @@ void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, b if (verbose) tic = omp_get_wtime(); if (type==0) { // const - for (uint_fast64_t i=0; idbl_v[i] = nan_is_na ? (ISNAN(x[i]) ? fill : x[i]) : (ISNA(x[i]) ? fill : x[i]); + if (nan_is_na) { + for (uint_fast64_t i=0; idbl_v[i] = ISNAN(x[i]) ? fill : x[i]; + } + } else { + for (uint_fast64_t i=0; idbl_v[i] = ISNA(x[i]) ? fill : x[i]; + } } } else if (type==1) { // locf ans->dbl_v[0] = x[0]; - for (uint_fast64_t i=1; idbl_v[i] = nan_is_na ? (ISNAN(x[i]) ? ans->dbl_v[i-1] : x[i]) : (ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]); + if (nan_is_na) { + for (uint_fast64_t i=1; idbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i-1] : x[i]; + } + } else { + for (uint_fast64_t i=1; idbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i-1] : x[i]; + } } } else if (type==2) { // nocb ans->dbl_v[nx-1] = x[nx-1]; - for (int_fast64_t i=nx-2; i>=0; i--) { - ans->dbl_v[i] = nan_is_na ? (ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]) : (ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]); + if (nan_is_na) { + for (int_fast64_t i=nx-2; i>=0; i--) { + ans->dbl_v[i] = ISNAN(x[i]) ? ans->dbl_v[i+1] : x[i]; + } + } else { + for (int_fast64_t i=nx-2; i>=0; i--) { + ans->dbl_v[i] = ISNA(x[i]) ? ans->dbl_v[i+1] : x[i]; + } } } if (verbose) From 28803ff4901001aa2f3cf170abbe10f8cf9a32ec Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 3 Nov 2019 20:22:37 +0800 Subject: [PATCH 3/8] note in manual --- man/nafill.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/nafill.Rd b/man/nafill.Rd index 00a71d3393..6583adcd14 100644 --- a/man/nafill.Rd +++ b/man/nafill.Rd @@ -19,7 +19,7 @@ setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, \item{x}{ vector, list, data.frame or data.table of numeric columns. } \item{type}{ character, one of \emph{"const"}, \emph{"locf"} or \emph{"nocb"}. Defaults to \code{"const"}. } \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. } - \item{nan}{ Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? } + \item{nan}{ (numeric \code{x} only) Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? } \item{cols}{ numeric or character vector specifying columns to be updated. } \item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. } } From 6954f3b1c00f2c224de600afa33f691b4ed2924e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 4 Nov 2019 00:51:03 +0800 Subject: [PATCH 4/8] fix tests, update NEWS, allow NA_real_ value --- NEWS.md | 2 +- R/utils.R | 4 ++-- inst/tests/nafill.Rraw | 14 ++++++++++++-- inst/tests/tests.Rraw | 10 ---------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index 70e66dbc3b..aacff8711d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ ## NEW FEATURES -1. `nafill` and `setnafill` gain `nan` argument (default `NA`, alternative `NaN`) to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Thanks @AnonymousBoba for the suggestion. +1. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. ## BUG FIXES diff --git a/R/utils.R b/R/utils.R index 06a62f2039..9af8a37238 100644 --- a/R/utils.R +++ b/R/utils.R @@ -16,9 +16,9 @@ allNA = function(x) .Call(C_allNAR, x) # helper for nan argument (e.g. nafill): TRUE -> treat NaN as NA nan_is_na = function(x) { if (length(x) > 1L) stop("Argument 'nan' must be length 1") - if (identical(x, NA)) return(TRUE) + if (identical(x, NA) || identical(x, NA_real_)) return(TRUE) if (identical(x, NaN)) return(FALSE) - stop("Argument 'nan' must be NA (_not_ NA_real_) or NaN") + stop("Argument 'nan' must be NA or NaN") } if (base::getRversion() < "3.2.0") { # Apr 2015 diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index ecaf610499..957f0c48a8 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -44,8 +44,8 @@ z = y z[5L] = NaN z[2L] = Inf z[9L] = -Inf -test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,NaN,NaN,7,8,-Inf,-Inf)/2) -test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,NaN,7,7,8,-Inf,NA)/2) +test(1.21, nafill(z, "locf"), c(NA,Inf,3,4,4,4,7,8,-Inf,-Inf)/2) +test(1.22, nafill(z, "nocb"), c(Inf,Inf,3,4,7,7,7,8,-Inf,NA)/2) dt = data.table(x, y, z) test(1.31, nafill(dt, "locf"), unname(lapply(dt, nafill, "locf"))) test(1.32, nafill(dt, "nocb"), unname(lapply(dt, nafill, "nocb"))) @@ -186,3 +186,13 @@ if (test_bit64) { test(6.53, identical(coerceFill(-2147483648), list(NA_integer_, -2147483648, as.integer64("-2147483648")))) test(6.54, identical(coerceFill(-2147483649), list(NA_integer_, -2147483649, as.integer64("-2147483649")))) } + +# nan argument to treat NaN as NA in nafill, #4020 +x = c(NA, NaN, 0, 1, 2) +ans1 = c(0, 0, 0:2) +ans2 = c(0, NaN, 0:2) +DT = data.table(a=x, b=x) +test(7.1, nafill(x, fill=0), ans1) +test(7.2, nafill(x, fill=0, nan=NaN), ans2) +test(7.3, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) +test(7.4, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d9df73872d..81d9fcaebd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16370,16 +16370,6 @@ test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L)) # wrapping wi test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]], # different result with the NA data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L))) -# nan_is_na to treat NaN as NA in nafill, #4020 -x = c(NA, NaN, 0, 1, 2) -ans1 = c(0, 0, 0:2) -ans2 = c(0, NaN, 0:2) -DT = data.table(a=x, b=x) -test(2121.1, nafill(x, fill=0), ans1) -test(2121.2, nafill(x, fill=0, nan=NaN), ans2) -test(2121.3, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) -test(2121.4, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) - ################################### # Add new tests above this line # From 24579d06cad24e50cd739e4dc70cc0b09dd8097a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 8 Nov 2019 19:50:07 +0800 Subject: [PATCH 5/8] coverage --- R/utils.R | 2 +- inst/tests/nafill.Rraw | 8 ++++++-- src/nafill.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/utils.R b/R/utils.R index 9af8a37238..fe5ac0ef9f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -15,7 +15,7 @@ isTRUEorFALSE = function(x) is.logical(x) && length(x)==1L && !is.na(x) allNA = function(x) .Call(C_allNAR, x) # helper for nan argument (e.g. nafill): TRUE -> treat NaN as NA nan_is_na = function(x) { - if (length(x) > 1L) stop("Argument 'nan' must be length 1") + if (length(x) != 1L) stop("Argument 'nan' must be length 1") if (identical(x, NA) || identical(x, NA_real_)) return(TRUE) if (identical(x, NaN)) return(FALSE) stop("Argument 'nan' must be NA or NaN") diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index 957f0c48a8..41d69b173c 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -194,5 +194,9 @@ ans2 = c(0, NaN, 0:2) DT = data.table(a=x, b=x) test(7.1, nafill(x, fill=0), ans1) test(7.2, nafill(x, fill=0, nan=NaN), ans2) -test(7.3, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) -test(7.4, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) +test(7.3, nafill(x, 'locf'), c(NA, NA, 0, 1:2)) +test(7.4, nafill(x, 'nocb'), ans1) +test(7.5, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) +test(7.6, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) +test(7.7, nafill(x, fill=0, nan=c(NA, NaN)), error="Argument 'nan' must be length 1") +test(7.8, nafill(x, fill=0, nan=Inf), error="Argument 'nan' must be NA or NaN") diff --git a/src/nafill.c b/src/nafill.c index bc1c99a743..38a83cf909 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -171,7 +171,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S nafillInteger64(i64x[i], inx[i], itype, i64fill, &vans[i], bverbose); } else { if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) - error("nan_is_na must be TRUE or FALSE"); + error("nan_is_na must be TRUE or FALSE"); // # nocov bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; nafillDouble(dx[i], inx[i], itype, dfill, nan_is_na, &vans[i], bverbose); } From 7268a9a91e6eea6e33608327a7c2d426c1f8e4a2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 22 Nov 2019 09:58:15 +0800 Subject: [PATCH 6/8] more tests --- inst/tests/nafill.Rraw | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index 41d69b173c..fb724f16d1 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -188,15 +188,17 @@ if (test_bit64) { } # nan argument to treat NaN as NA in nafill, #4020 -x = c(NA, NaN, 0, 1, 2) -ans1 = c(0, 0, 0:2) -ans2 = c(0, NaN, 0:2) +x = c(-1, NA, NaN, 0, 1, 2) +ans1 = c(-1, 0, 0, 0:2) +ans2 = c(-1, 0, NaN, 0:2) DT = data.table(a=x, b=x) -test(7.1, nafill(x, fill=0), ans1) -test(7.2, nafill(x, fill=0, nan=NaN), ans2) -test(7.3, nafill(x, 'locf'), c(NA, NA, 0, 1:2)) -test(7.4, nafill(x, 'nocb'), ans1) -test(7.5, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) -test(7.6, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) -test(7.7, nafill(x, fill=0, nan=c(NA, NaN)), error="Argument 'nan' must be length 1") -test(7.8, nafill(x, fill=0, nan=Inf), error="Argument 'nan' must be NA or NaN") +test(7.01, nafill(x, fill=0), ans1) +test(7.02, nafill(x, fill=0, nan=NaN), ans2) +test(7.03, nafill(x, 'locf'), c(-1, -1, -1, 0:2)) +test(7.04, nafill(x, 'locf', nan=NaN), c(-1, -1, NaN, 0:2)) +test(7.05, nafill(x, 'nocb'), ans1) +test(7.06, nafill(x, 'nocb', nan=NaN), c(-1, NaN, NaN, 0:2)) +test(7.07, setnafill(DT, fill=0, cols=1L), copy(DT)[ , a := ans1]) +test(7.08, setnafill(DT, fill=0, nan=NaN), copy(DT)[ , c('a', 'b') := .(ans1, ans2)]) +test(7.09, nafill(x, fill=0, nan=c(NA, NaN)), error="Argument 'nan' must be length 1") +test(7.10, nafill(x, fill=0, nan=Inf), error="Argument 'nan' must be NA or NaN") From fe97c03a26ebeb4bcfbbef962c6a9e88db713e02 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 22 Nov 2019 10:52:24 +0800 Subject: [PATCH 7/8] move verbose argument to C only --- NEWS.md | 2 +- R/shift.R | 8 ++++---- inst/tests/nafill.Rraw | 9 +++++---- man/nafill.Rd | 9 ++++----- src/data.table.h | 2 +- src/nafill.c | 20 +++++++++----------- 6 files changed, 24 insertions(+), 26 deletions(-) diff --git a/NEWS.md b/NEWS.md index 40ee2fbdb3..502a207d3f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,7 @@ 1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061). -2. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. +2. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. ## BUG FIXES diff --git a/R/shift.R b/R/shift.R index 0299af0ff8..63a1cdec42 100644 --- a/R/shift.R +++ b/R/shift.R @@ -24,16 +24,16 @@ shift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift"), give.names=FA ans } -nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, verbose=getOption("datatable.verbose")) { +nafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - .Call(CnafillR, x, type, fill, nan_is_na(nan), FALSE, NULL, verbose) + .Call(CnafillR, x, type, fill, nan_is_na(nan), FALSE, NULL) } -setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x), verbose=getOption("datatable.verbose")) { +setnafill = function(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) { type = match.arg(type) if (type!="const" && !missing(fill)) warning("argument 'fill' ignored, only make sense for type='const'") - invisible(.Call(CnafillR, x, type, fill, nan_is_na(nan), TRUE, cols, verbose)) + invisible(.Call(CnafillR, x, type, fill, nan_is_na(nan), TRUE, cols)) } diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index fb724f16d1..99a404b4d9 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -152,12 +152,13 @@ test(4.24, colnamesInt(dt, "a"), error="has no names") # verbose dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_)) -test(5.01, nafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") -test(5.02, setnafill(dt, "locf", verbose=TRUE), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") -test(5.03, nafill(dt, "locf", verbose=NA), error="verbose must be TRUE or FALSE") +old=options(datatable.verbose=TRUE) +test(5.01, nafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") +test(5.02, setnafill(dt, "locf"), output="nafillInteger: took.*nafillDouble: took.*nafillR.*took") if (test_bit64) { - test(5.04, nafill(as.integer64(c(NA,2,NA,3)), "locf", verbose=TRUE), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took") + test(5.03, nafill(as.integer64(c(NA,2,NA,3)), "locf"), as.integer64(c(NA,2,2,3)), output="nafillInteger64: took.*nafillR.*took") } +options(old) # coerceFill if (test_bit64) { diff --git a/man/nafill.Rd b/man/nafill.Rd index 6583adcd14..f8afb1dcfa 100644 --- a/man/nafill.Rd +++ b/man/nafill.Rd @@ -10,10 +10,8 @@ Fast fill missing values using constant value, \emph{last observation carried forward} or \emph{next observation carried backward}. } \usage{ -nafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, - verbose=getOption("datatable.verbose")) -setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, - cols=seq_along(x), verbose=getOption("datatable.verbose")) +nafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA) +setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, cols=seq_along(x)) } \arguments{ \item{x}{ vector, list, data.frame or data.table of numeric columns. } @@ -21,10 +19,11 @@ setnafill(x, type=c("const","locf","nocb"), fill=NA, nan=NA, \item{fill}{ numeric or integer, value to be used to fill when \code{type=="const"}. } \item{nan}{ (numeric \code{x} only) Either \code{NaN} or \code{NA}; if the former, \code{NaN} is treated as distinct from \code{NA}, otherwise, they are treated the same during replacement? } \item{cols}{ numeric or character vector specifying columns to be updated. } - \item{verbose}{ logical, \code{TRUE} turns on timing messages to the console. } } \details{ Only \emph{double} and \emph{integer} data types are currently supported. + + Note that both \code{nafill} and \code{setnafill} provide some verbose output when \code{getOption('datatable.verbose')} is \code{TRUE}. } \value{ A list except when the input is a \code{vector} in which case a \code{vector} is returned. For \code{setnafill} the input argument is returned, updated by reference. diff --git a/src/data.table.h b/src/data.table.h index 8d92fdd935..1279244db2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -203,7 +203,7 @@ SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho); // nafill.c void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose); void nafillInteger(int32_t *x, uint_fast64_t nx, unsigned int type, int32_t fill, ans_t *ans, bool verbose); -SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols, SEXP verbose); +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols); // between.c SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP check); diff --git a/src/nafill.c b/src/nafill.c index 38a83cf909..4792c9c635 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -85,11 +85,9 @@ void nafillInteger64(int64_t *x, uint_fast64_t nx, unsigned int type, int64_t fi snprintf(ans->message[0], 500, "%s: took %.3fs\n", __func__, omp_get_wtime()-tic); } -SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols, SEXP verbose) { +SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, SEXP cols) { int protecti=0; - if (!IS_TRUE_OR_FALSE(verbose)) - error("verbose must be TRUE or FALSE"); - bool bverbose = LOGICAL(verbose)[0]; + const bool verbose = GetVerbose(); if (!xlength(obj)) return(obj); @@ -160,7 +158,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S coerceFill(fill, &dfill, &ifill, &i64fill); double tic=0.0, toc=0.0; - if (bverbose) + if (verbose) tic = omp_get_wtime(); #pragma omp parallel for if (nx>1) num_threads(getDTthreads()) for (R_len_t i=0; i Date: Wed, 18 Dec 2019 14:46:07 +0800 Subject: [PATCH 8/8] merge master & move NEWS item --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 8c13ee998d..3382e22593 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,9 @@ 1. `%chin%` and `chmatch(x, table)` are faster when `x` is length 1, `table` is long, and `x` occurs near the start of `table`. Thanks to Michael Chirico for the suggestion, [#4117](https://github.com/Rdatatable/data.table/pull/4117#discussion_r358378409). +2. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. + + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). @@ -23,7 +26,6 @@ 1. `DT[, {...; .(A,B)}]` (i.e. when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. Thanks also to @renkun-ken for his go-first strong testing which caught an issue not caught by the test suite or by revdep testing, related to NULL being the last item, [#4061](https://github.com/Rdatatable/data.table/issues/4061). -2. `nafill` and `setnafill` gain `nan` argument to say whether `NaN` should be considered the same as `NA` for filling purposes, [#4020](https://github.com/Rdatatable/data.table/issues/4020). Prior versions had an implicit value of `nan=NaN`; the default is now `nan=NA`, i.e., `NaN` is treated as if it's missing. Thanks @AnonymousBoba for the suggestion. Also, while `nafill` still respects `getOption('datatable.verbose')`, the `verbose` argument has been removed. ## BUG FIXES