From 9e23c896501d6c2fad673d55790798667c49721a Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Fri, 18 Jul 2025 05:01:59 +0530 Subject: [PATCH 1/7] added nan parameter to fcoalesce --- R/wrappers.R | 4 ++-- man/coalesce.Rd | 6 +++++- src/coalesce.c | 12 +++++++----- src/data.table.h | 2 +- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index d4ca3fbc15..b68729a564 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -2,8 +2,8 @@ # Very small (e.g. one line) R functions that just call C. # One file wrappers.R to avoid creating lots of small .R files. -fcoalesce = function(...) .Call(Ccoalesce, list(...), FALSE) -setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE) +fcoalesce = function(..., nan=NA) .Call(Ccoalesce, list(...), FALSE, nan_is_na(nan)) +setcoalesce = function(..., nan=NA) .Call(Ccoalesce, list(...), TRUE, nan_is_na(nan)) fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) fcase = function(..., default=NA) { diff --git a/man/coalesce.Rd b/man/coalesce.Rd index ebd560f8be..e527c984ff 100644 --- a/man/coalesce.Rd +++ b/man/coalesce.Rd @@ -7,10 +7,11 @@ Fill in missing values in a vector by successively pulling from candidate vector Written in C, and multithreaded for numeric and factor types. } \usage{ - fcoalesce(\dots) + fcoalesce(\dots, nan = NA) } \arguments{ \item{\dots}{ A set of same-class vectors. These vectors can be supplied as separate arguments or as a single plain list, data.table or data.frame, see examples. } + \item{nan}{ (numeric vectors only) Either \code{NaN} or \code{NA}; if \code{NaN}, then \code{NaN} is treated as distinct from \code{NA}, otherwise they are treated the same during replacement. } } \details{ Factor type is supported only when the factor levels of each item are equal. 
@@ -31,6 +32,9 @@ z = c(11L, NA, 1L, 14L, NA, NA) fcoalesce(x, y, z) fcoalesce(list(x,y,z)) # same fcoalesce(x, list(y,z)) # same +x_num = c(NaN, NA_real_, 3.0) +fcoalesce(x_num, 1) # default: NaN treated as missing -> c(1, 1, 3) +fcoalesce(x_num, 1, nan=NaN) # preserve NaN -> c(NaN, 1, 3) } \keyword{ data } diff --git a/src/coalesce.c b/src/coalesce.c index 02c7fc5494..d7355f623f 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -6,10 +6,12 @@ - The replacement of NAs with non-NA values from subsequent vectors - The conditional checks within parallelized loops */ -SEXP coalesce(SEXP x, SEXP inplaceArg) { +SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) { if (TYPEOF(x)!=VECSXP) internal_error(__func__, "input is list(...) at R level"); // # nocov if (!IS_TRUE_OR_FALSE(inplaceArg)) internal_error(__func__, "argument 'inplaceArg' must be TRUE or FALSE"); // # nocov + if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) internal_error(__func__, "argument 'nan_is_na_arg' must be TRUE or FALSE"); // # nocov const bool inplace = LOGICAL(inplaceArg)[0]; + const bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; const bool verbose = GetVerbose(); int nprotect = 0; if (length(x)==0 || isNull(VECTOR_ELT(x,0))) return R_NilValue; // coalesce(NULL, "foo") return NULL even though character type mismatches type NULL @@ -106,7 +108,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) { SEXP item = VECTOR_ELT(x, j+off); if (length(item)==1) { double tt = REAL(item)[0]; - if (ISNAN(tt)) continue; + if (nan_is_na ? 
ISNAN(tt) : ISNA(tt)) continue; finalVal = tt; break; } @@ -116,9 +118,9 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) { #pragma omp parallel for num_threads(getDTthreads(nrow, true)) for (int i=0; i Date: Fri, 18 Jul 2025 10:29:08 -0700 Subject: [PATCH 2/7] Style, link ?nafill --- man/coalesce.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/coalesce.Rd b/man/coalesce.Rd index e527c984ff..ed8a5609fa 100644 --- a/man/coalesce.Rd +++ b/man/coalesce.Rd @@ -7,7 +7,7 @@ Fill in missing values in a vector by successively pulling from candidate vector Written in C, and multithreaded for numeric and factor types. } \usage{ - fcoalesce(\dots, nan = NA) + fcoalesce(\dots, nan=NA) } \arguments{ \item{\dots}{ A set of same-class vectors. These vectors can be supplied as separate arguments or as a single plain list, data.table or data.frame, see examples. } @@ -23,7 +23,7 @@ Atomic vector of the same type and length as the first vector, having \code{NA} If the first item is \code{NULL}, the result is \code{NULL}. } \seealso{ - \code{\link{fifelse}} + \code{\link{fifelse}}, \code{\link{nafill}} } \examples{ x = c(11L, NA, 13L, NA, 15L, NA) From e8a25cf9b542134031f394b9cf54ab5189d8ac40 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 18 Jul 2025 10:30:10 -0700 Subject: [PATCH 3/7] incorporate #7186 insights here too --- man/coalesce.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/coalesce.Rd b/man/coalesce.Rd index ed8a5609fa..2fffbb057e 100644 --- a/man/coalesce.Rd +++ b/man/coalesce.Rd @@ -11,7 +11,7 @@ Written in C, and multithreaded for numeric and factor types. } \arguments{ \item{\dots}{ A set of same-class vectors. These vectors can be supplied as separate arguments or as a single plain list, data.table or data.frame, see examples. 
} - \item{nan}{ (numeric vectors only) Either \code{NaN} or \code{NA}; if \code{NaN}, then \code{NaN} is treated as distinct from \code{NA}, otherwise they are treated the same during replacement. } + \item{nan}{ Either \code{NaN} or \code{NA}; if \code{NaN}, then \code{NaN} is treated as distinct from \code{NA}, otherwise they are treated the same during replacement (double columns only). } } \details{ Factor type is supported only when the factor levels of each item are equal. From 2952472a719445a1f7ba9b3e443b38b32339877b Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Mon, 21 Jul 2025 20:26:02 +0530 Subject: [PATCH 4/7] duplicate loop for NA and NAN arg --- inst/tests/tests.Rraw | 3 +++ src/coalesce.c | 53 ++++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index aceeb77f89..3c1a4dee07 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15570,6 +15570,9 @@ test(2060.154, fcoalesce(list(x)), x) test(2060.155, setcoalesce(list(x)), x) test(2060.156, setcoalesce(list(x,y,z)), ans) test(2060.157, x, ans) # setcoalesce updated the first item (x) by reference +# nan parameter, #4567 +test(2060.158, fcoalesce(c(NA_real_, NaN), 0, nan=NA), c(0, NaN)) +test(2060.159, fcoalesce(c(NA_real_, NaN), 0, nan=NaN), c(0, 0)) # factor of different levels x = factor(c('a','b',NA,NA,'b')) y = factor(c('b','b','a',NA,'b')) diff --git a/src/coalesce.c b/src/coalesce.c index d7355f623f..10b7b77576 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -104,23 +104,44 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) { } else { double *xP = REAL(first), finalVal=NA_REAL; int k=0; - for (int j=0; j Date: Mon, 21 Jul 2025 20:37:39 +0530 Subject: [PATCH 5/7] tests --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3c1a4dee07..6316d81528 100644 --- 
a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15571,8 +15571,8 @@ test(2060.155, setcoalesce(list(x)), x) test(2060.156, setcoalesce(list(x,y,z)), ans) test(2060.157, x, ans) # setcoalesce updated the first item (x) by reference # nan parameter, #4567 -test(2060.158, fcoalesce(c(NA_real_, NaN), 0, nan=NA), c(0, NaN)) -test(2060.159, fcoalesce(c(NA_real_, NaN), 0, nan=NaN), c(0, 0)) +test(2060.158, fcoalesce(c(NA_real_, NaN), 0, nan=NA), c(0, 0)) +test(2060.159, fcoalesce(c(NA_real_, NaN), 0, nan=NaN), c(0, NaN)) # factor of different levels x = factor(c('a','b',NA,NA,'b')) y = factor(c('b','b','a',NA,'b')) From 92c62280dc84fb940d0cc24a3b6f75c26b4d3cd6 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Mon, 21 Jul 2025 22:26:47 +0530 Subject: [PATCH 6/7] added tests for use of vector replacement also --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6316d81528..3996544ebc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15573,6 +15573,8 @@ test(2060.157, x, ans) # setcoalesce updated the first item (x) by reference # nan parameter, #4567 test(2060.158, fcoalesce(c(NA_real_, NaN), 0, nan=NA), c(0, 0)) test(2060.159, fcoalesce(c(NA_real_, NaN), 0, nan=NaN), c(0, NaN)) +test(2060.160, fcoalesce(c(NA_real_, NaN), c(1, 2), nan=NA), c(1, 2)) +test(2060.161, fcoalesce(c(NA_real_, NaN), c(1, 2), nan=NaN), c(1, NaN)) # factor of different levels x = factor(c('a','b',NA,NA,'b')) y = factor(c('b','b','a',NA,'b')) From 24c2cdedeaf79af2814bb4dbff94c760177d4532 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Mon, 21 Jul 2025 23:56:02 +0530 Subject: [PATCH 7/7] added news entry --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index eb2fbd047b..a1ac61843b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -46,6 +46,8 @@ 10. 
`data.table()` and `as.data.table()` with `keep.rownames=TRUE` now extract row names from named vectors, matching `data.frame()` behavior. Names from the first named vector in the input are used to create the row names column (default name `"rn"` or custom name via `keep.rownames="column_name"`), [#1916](https://github.com/Rdatatable/data.table/issues/1916). Thanks to @richierocks for the feature request and @Mukulyadav2004 for the implementation. +11. `fcoalesce()` and `setcoalesce()` gain a `nan` argument to control whether `NaN` values should be treated as missing (`nan=NA`, the default) or non-missing (`nan=NaN`), [#4567](https://github.com/Rdatatable/data.table/issues/4567). This mirrors the `nan` argument of `nafill()`. Thanks to @ethanbsmith for the feature request and @Mukulyadav2004 for the implementation. + ### BUG FIXES 1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.