Rdatatable · mattdowle · Jun 18, 2019 · May 27, 2019 · May 28, 2019 · May 29, 2019
@@ -53,6 +53,7 @@ export(frollsum)
 export(nafill)
 export(setnafill)
 export(.Last.updated)
+export(coalesce)
 
 S3method("[", data.table)
 S3method("[<-", data.table)

@@ -100,18 +100,35 @@
     # default 4 threads on a laptop with 16GB RAM and 8 logical CPU
 
     ids = as.vector(outer(LETTERS, LETTERS, paste0))
-    system.time(DT1 <- CJ(ids, 1:500000))  # 3.9GB; 340m rows
-    #   user  system elapsed
+    system.time( CJ(ids, 1:500000) )  # 3.9GB; 340m rows
+    #   user  system elapsed (seconds)
     #  3.000   0.817   3.798  # was
     #  1.800   0.832   2.190  # now
 
-    ids = as.factor(ids)
-    system.time(DT2 <- CJ(ids, 1:500000))  # 2.6GB; 340m rows
-    #   user  system elapsed
+    # ids = as.factor(ids)
+    system.time( CJ(ids, 1:500000) )  # 2.6GB; 340m rows
+    #   user  system elapsed (seconds)
     #  1.779   0.534   2.293  # was
     #  0.357   0.763   0.292  # now
     ```
 
+18. New function `coalesce(x, ...)` has been written in C, and is multithreaded for numeric and factor types. It replaces missing values according to a prioritized list of candidates (as per SQL COALESCE, `dplyr::coalesce`, and `hutils::coalesce`), [#3424](https://github.com/Rdatatable/data.table/issues/3424). It accepts any number of vectors in several forms. For example, given three vectors `x`, `y`, and `z`, where each `NA` in `x` is to be replaced by the corresponding value in `y` if that is non-NA, else the corresponding value in `z`, the following equivalent forms are all accepted: `coalesce(x,y,z)`, `coalesce(x,list(y,z))`, and `coalesce(list(x,y,z))`.
+
+    ```R
+    # default 4 threads on a laptop with 16GB RAM and 8 logical CPU
+    N = 100e6
+    x = replicate(5, {x=sample(N); x[sample(N, N/2)]=NA; x}, simplify=FALSE)  # 2GB
+    system.time(y1 <- do.call(dplyr::coalesce, x))
+    system.time(y2 <- do.call(hutils::coalesce, x))
+    system.time(y3 <- do.call(data.table::coalesce, x))
+    #   user  system elapsed (seconds)
+    #  4.935   1.876   6.810  # dplyr::coalesce
+    #  3.122   0.831   3.956  # hutils::coalesce
+    #  0.915   0.099   0.379  # data.table::coalesce
+    identical(y1,y2) && identical(y1,y3)
+    # TRUE
+    ```
+
 
 #### BUG FIXES
 
@@ -172,7 +189,6 @@
     # 2:     2      a
     ```
 
-
 #### NOTES
 
 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.

@@ -0,0 +1,29 @@
+
+# coalesce(x, ...): fill the missing values of `x` from a prioritized list of
+# candidate vectors, returning the result.
+#
+# Accepted call forms (all equivalent):
+#   coalesce(x, y, z)        candidates passed individually in `...`
+#   coalesce(x, list(y, z))  candidates passed as one list
+#   coalesce(list(x, y, z))  first list item is the target, the rest are candidates
+#
+# Returns `x` unchanged when it is not a list and no candidates are given, the
+# sole item when `x` is a one-item list, and NULL when `x` is an empty list.
+# Type, class, length and factor-level validation is left to the C routine.
+coalesce = function(x, ...) {
+  if (missing(..1)) {
+    # nothing in `...`: either there is nothing to coalesce with, or `x` is a list carrying everything
+    if (!is.list(x)) return(x)
+    if (!length(x)) return(NULL)         # empty list: x[[1L]] would fail with 'subscript out of bounds'
+    if (length(x)==1L) return(x[[1L]])   # only a target, no candidates
+    values = x[-1L]
+    x = x[[1L]]
+  } else {
+    # two or more arguments in `...` are used as given; a lone argument may itself be the list of candidates
+    if (!missing(..2)) values = list(...)
+    else values = if (is.list(..1)) ..1 else list(..1)
+  }
+  # third argument FALSE here; the by-reference sibling setcoalesce passes TRUE
+  .Call(Ccoalesce, x, values, FALSE)
+}
+
+# setcoalesce(x, ...): same argument handling as coalesce(x, ...), but passes
+# TRUE as the third argument to the C routine and returns its result invisibly.
+#
+# Accepted call forms (all equivalent):
+#   setcoalesce(x, y, z)        candidates passed individually in `...`
+#   setcoalesce(x, list(y, z))  candidates passed as one list
+#   setcoalesce(list(x, y, z))  first list item is the target, the rest are candidates
+#
+# Returns `x` itself when it is not a list and no candidates are given, the
+# sole item when `x` is a one-item list, and NULL when `x` is an empty list.
+setcoalesce = function(x, ...) {
+  if (missing(..1)) {
+    # nothing in `...`: either there is nothing to coalesce with, or `x` is a list carrying everything
+    if (!is.list(x)) return(x)
+    if (!length(x)) return(NULL)         # empty list: x[[1L]] would fail with 'subscript out of bounds'
+    if (length(x)==1L) return(x[[1L]])   # only a target, no candidates
+    values = x[-1L]
+    x = x[[1L]]
+  } else {
+    # two or more arguments in `...` are used as given; a lone argument may itself be the list of candidates
+    if (!missing(..2)) values = list(...)
+    else values = if (is.list(..1)) ..1 else list(..1)
+  }
+  # third argument TRUE here (coalesce passes FALSE); presumably selects in-place update -- confirm in the C source
+  invisible(.Call(Ccoalesce, x, values, TRUE))
+}
+
@@ -43,6 +43,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   replace_dot_alias = data.table:::replace_dot_alias
   rollup.data.table = data.table:::rollup.data.table
   selfrefok = data.table:::selfrefok
+  setcoalesce = data.table:::setcoalesce
   setdiff_ = data.table:::setdiff_
   setreordervec = data.table:::setreordervec
   setrev = data.table:::setrev
@@ -15063,6 +15064,131 @@ DT = data.table(a=1)
 test(2059.1, rbindlist(list(DT,1)), error="Item 2 of input is not a data.frame, data.table or list")
 test(2059.2, rbindlist(DT), error="Input is data.table but should be a plain list of items to be stacked")
 
+# coalesce, #3424. Fixtures: each <type> vector holds at least one NA; <type>_val is the result expected once the NAs are filled
+bool = c(TRUE, NA, FALSE)
+bool_val = c(TRUE, TRUE, FALSE)
+int = c(1L, 2L, NA_integer_, 4L)
+int_val = 1:4
+num = c(1, 2, NA_real_, 4)
+num_val = c(1, 2, 3, 4)
+str = c('a', NA_character_, 'b', NA_character_)
+str_val = c('a', 'b', 'b', 'b')
+fkt = factor(str)  # levels 'a' and 'b'
+fkt_val = factor(str_val)
+date = as.Date(int, origin="1970-01-01")  # the remaining fixtures are built from int / int_val
+date_val = as.Date(int_val, origin="1970-01-01")
+idate = as.IDate(int, origin="1970-01-01")
+idate_val = as.IDate(int_val, origin="1970-01-01")
+itime = as.ITime(int)
+itime_val = as.ITime(int_val)
+posix = as.POSIXct(int, origin="1970-01-01")
+posix_val = as.POSIXct(int_val, origin="1970-01-01")
+# singleton replacements: a length-1 candidate fills every NA; the even-numbered tests put an all-NA candidate first, which must be passed over
+test(2060.001, coalesce(bool, TRUE), bool_val)
+test(2060.002, coalesce(bool, NA, TRUE), bool_val)
+test(2060.003, coalesce(int, 3L), int_val)
+test(2060.004, coalesce(int, NA_integer_, 3L), int_val)
+test(2060.005, coalesce(num, 3), num_val)
+test(2060.006, coalesce(num, NA_real_, 3), num_val)
+test(2060.007, coalesce(str, 'b'), str_val)
+test(2060.008, coalesce(str, NA_character_, 'b'), str_val)
+test(2060.009, coalesce(fkt, factor('b', levels = c('a', 'b'))), fkt_val)
+test(2060.010, coalesce(fkt, factor(NA_integer_, levels=c("a","b")), factor('b', levels = c('a', 'b'))), fkt_val)
+test(2060.011, coalesce(date, as.Date("1970-01-04")), date_val)
+test(2060.012, coalesce(date, as.Date(NA), as.Date("1970-01-04")), date_val)
+test(2060.013, coalesce(idate, as.IDate("1970-01-04")), idate_val)
+test(2060.014, coalesce(idate, as.IDate(NA), as.IDate("1970-01-04")), idate_val)
+test(2060.015, coalesce(itime, as.ITime(3L)), itime_val)
+test(2060.016, coalesce(itime, as.ITime(NA), as.ITime(3L)), itime_val)
+test(2060.017, coalesce(posix, as.POSIXct(3L, origin="1970-01-01")), posix_val)
+test(2060.018, coalesce(posix, as.POSIXct(NA_integer_, origin="1970-01-01"), as.POSIXct(3L, origin="1970-01-01")), posix_val)
+# vector replacements: the same cases with candidates as long as x; the even-numbered tests again put an all-NA candidate first
+test(2060.051, coalesce(bool, rep(TRUE, 3L)), bool_val)
+test(2060.052, coalesce(bool, rep(NA, 3L), rep(TRUE, 3L)), bool_val)
+test(2060.053, coalesce(int, rep(3L, 4L)), int_val)
+test(2060.054, coalesce(int, rep(NA_integer_, 4L), rep(3L, 4L)), int_val)
+test(2060.055, coalesce(num, rep(3, 4L)), num_val)
+test(2060.056, coalesce(num, rep(NA_real_, 4L), rep(3, 4L)), num_val)
+test(2060.057, coalesce(str, rep('b', 4L)), str_val)
+test(2060.058, coalesce(str, rep(NA_character_, 4L), rep('b', 4L)), str_val)
+test(2060.059, coalesce(fkt, factor(rep('b', 4L), levels=c('a', 'b'))), fkt_val)
+test(2060.060, coalesce(fkt, factor(rep(NA_integer_, 4L), levels=c("a","b")), factor(rep('b', 4L), levels=c('a', 'b'))), fkt_val)
+test(2060.061, coalesce(date, rep(as.Date("1970-01-04"), 4L)), date_val)
+test(2060.062, coalesce(date, rep(as.Date(NA), 4L), rep(as.Date("1970-01-04"), 4L)), date_val)
+test(2060.063, coalesce(idate, rep(as.IDate("1970-01-04"), 4L)), idate_val)
+test(2060.064, coalesce(idate, rep(as.IDate(NA), 4L), rep(as.IDate("1970-01-04"), 4L)), idate_val)
+test(2060.065, coalesce(itime, rep(as.ITime(3L), 4L)), itime_val)
+test(2060.066, coalesce(itime, rep(as.ITime(NA), 4L), rep(as.ITime(3L), 4L)), itime_val)
+test(2060.067, coalesce(posix, as.POSIXct(rep(3L, 4L), origin="1970-01-01")), posix_val)
+test(2060.068, coalesce(posix, as.POSIXct(rep(NA_integer_, 4L), origin="1970-01-01"), as.POSIXct(rep(3L, 4L), origin="1970-01-01")), posix_val)
+test(2060.101, coalesce(bool, list(NA, TRUE)), bool_val)  # candidates supplied as a single list
+# floating point extras: the expected answer keeps NaN and Inf as values, so only NA is filled
+x = c(11L, NA, 13L, NA, 15L, NaN, NA, NA, NA)+0.1
+y = c(NA, 12L, 5L, NA, NA, 16L, NaN, Inf, NA)+0.1
+z = c(11L, NA, 1L, 14L, NA, 16L, 1L, 2L, NA)+0.1
+test(2060.151, coalesce(x, y, z),       ans<-c(11:15,NaN,NaN,Inf,NA)+0.1)  # ans is reused by the tests below
+test(2060.152, coalesce(list(x, y, z)), ans)  # everything in one list
+test(2060.153, coalesce(x, list(y,z)),  ans)  # candidates in one list
+test(2060.154, coalesce(list(x)), x)  # one-item list: the item comes back as is
+test(2060.155, setcoalesce(list(x)), x)
+test(2060.156, setcoalesce(list(x,y,z)), ans)
+test(2060.157, x, ans)  # setcoalesce updated the first item (x) by reference; must run after 2060.156
+# factor of different levels: x and y both have levels 'a','b' but z has levels 'a','d', so item 3 is rejected
+x = factor(c('a','b',NA,NA,'b'))
+y = factor(c('b','b','a',NA,'b'))
+z = factor(c('a',NA,NA,'d','a'))
+test(2060.180, coalesce(x, y, z), error="Item 3 is a factor but its levels are not identical to the first item's levels")
+# edge cases/checks: no candidates, mismatched type / factor-ness / length / class, unsupported input
+test(2060.201, coalesce(bool), bool)  # nothing to coalesce with: x comes back unchanged
+test(2060.202, coalesce(fkt), fkt)
+test(2060.203, coalesce(bool, 1L), error='Item 2 is type integer but the first item is type logical. Please coerce before coalescing')
+test(2060.204, coalesce(bool, NA_integer_), error='Item 2 is type integer but the first item is type logical.')  # a typed NA is still type checked
+test(2060.205, coalesce(fkt, 1L), error='Item 1 is a factor but item 2 is not a factor. When factors are involved, all items must be factor')
+test(2060.206, coalesce(num, 3L), error='Item 2 is type integer but the first item is type double')
+test(2060.207, coalesce(int, 3), error='Item 2 is type double but the first item is type integer')
+test(2060.208, coalesce(fkt, 'b'), error='Item 1 is a factor but item 2 is not a factor. When factors are involved, all items must be factor.')
+test(2060.209, coalesce(str, factor('b')), error='Item 2 is a factor but item 1 is not a factor. When factors are involved, all items must be factor')
+test(2060.212, coalesce(list(1), list(2)), error="'x' must be an atomic vector")
+test(2060.213, coalesce(bool, c(TRUE, FALSE)), error="Item 2 is length 2 but the first item is length 3. Only singletons are recycled")
+test(2060.214, coalesce(as.raw(0), as.raw(1)), error="Unsupported type: raw")
+test(2060.215, coalesce(bool, list()), bool)  # empty candidate list: x comes back unchanged
+test(2060.216, coalesce(structure(c(1:2,NA,4L), class=c("a")), c(NA,NA,3L,4L)), error="Item 2 has a different class than item 1")
+# integer64 tests, run only when test_bit64 is TRUE
+if (test_bit64) {
+  int64 = as.integer64(int)
+  int64_val = as.integer64(1:4)
+  test(2060.301, as.character(coalesce(int64, as.integer64(3))), as.character(int64_val)) # compared via as.character; see the nanotime tests below for why
+  test(2060.302, as.character(coalesce(int64, as.integer64(NA), as.integer64(3))), as.character(int64_val))
+  test(2060.303, as.character(coalesce(int64, as.integer64(rep(3, 4L)))), as.character(int64_val))
+  test(2060.304, coalesce(int64, 1), error='Item 2 has a different class than item 1')  # plain double: the class check is what fails
+  test(2060.305, coalesce(int64, 1L), error = 'Item 2 is type integer but the first item is type double')  # plain integer: the type check is what fails
+}
+# nanotime tests, run only when test_nanotime is TRUE
+if (test_nanotime) {
+  nt = nanotime(int)
+  nt_val = nanotime(1:4)
+  test(2060.401, as.character(coalesce(nt, nanotime(3L))), as.character(nt_val)) # as.character due to eddelbuettel/nanotime#46
+  test(2060.402, as.character(coalesce(nt, nanotime(NA), nanotime(3L))), as.character(nt_val))
+  test(2060.403, as.character(coalesce(nt, nanotime(rep(3, 4L)))), as.character(nt_val))
+  test(2060.404, coalesce(nt, 1), error='Item 2 has a different class than item 1')  # plain double: the class check is what fails
+  test(2060.405, coalesce(nt, 1L), error = 'Item 2 is type integer but the first item is type double')  # plain integer: the type check is what fails
+}
+# setcoalesce: xx must hold the filled values afterwards while address(xx) stays the same, i.e. updated by reference
+x = c(11L, NA, 13L, NA, 15L, NA)
+y = c(NA, 12L, 5L, NA, NA, NA)
+z = c(11L, NA, 1L, 14L, NA, NA)
+xx = copy(x)  # work on a copy so x stays available as the reference input
+xx_addr = address(xx)
+setcoalesce(xx, y, z)
+test(2060.501, xx_addr, address(xx))
+test(2060.502, xx, c(11:15, NA))  # the last position is NA in x, y and z, so it stays NA
+xx = copy(x)
+xx_addr = address(xx)
+setcoalesce(xx, list())  # empty candidate list: nothing to fill from
+test(2060.503, xx_addr, address(xx))
+test(2060.504, xx, x)
+test(2060.505, address(setcoalesce(xx)), xx_addr)  # no candidates at all: the very same object is returned
+
 
 ###################################
 #  Add new tests above this line  #

@@ -0,0 +1,45 @@
+\name{coalesce}
+\alias{coalesce}
+\alias{setcoalesce}
+\title{ Coalescing missing values }
+\description{
+Fill in missing values in a vector by successively pulling from candidate vectors in order. This follows the behaviour of the ANSI SQL function COALESCE, \code{dplyr::coalesce} and \code{hutils::coalesce}.
+Written in C, and multithreaded for numeric and factor types.
+}
+\usage{
+  coalesce(x, \dots)
+}
+\arguments{
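+  \item{x}{ An atomic vector. Or, if \dots is missing, \code{x} may be a \code{list} of atomic vectors, in which case its first item is coalesced with the remaining items in order. }
+  \item{\dots}{ Vectors of the same class as \code{x} to be used successively as replacements for missing values of \code{x}, or a single \code{list} of such vectors. Each vector must be the same length as \code{x}, or a singleton, which is recycled. }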
+}
+\details{
+Factor type is supported only when factor levels are equal.
+}
+\value{
+Atomic vector of same type as \code{x} provided on input, having \code{NA} values replaced by corresponding non-\code{NA} values of arguments provided to \dots.
+}
+\examples{
+x = c(11L, NA, 13L, NA, 15L, NA)
+y = c(NA, 12L, 5L, NA, NA, NA)
+z = c(11L, NA, 1L, 14L, NA, NA)
+coalesce(x, y, z)
+coalesce(list(x,y,z))   # same
+coalesce(x, list(y,z))  # same
+
+\dontrun{
+# default 4 threads on a laptop with 16GB RAM and 8 logical CPU
+N = 100e6
+x = replicate(5, {x=sample(N); x[sample(N, N/2)]=NA; x}, simplify=FALSE)  # 2GB
+system.time(y1 <- do.call(dplyr::coalesce, x))
+system.time(y2 <- do.call(hutils::coalesce, x))
+system.time(y3 <- do.call(data.table::coalesce, x))
+identical(y1,y2) && identical(y1,y3)
+#   user  system elapsed (seconds)
+#  4.935   1.876   6.810  # dplyr v0.8.1
+#  3.122   0.831   3.956  # hutils v1.5.0
+#  0.915   0.099   0.379  # data.table v1.12.4
+}
+}
+\keyword{ data }
+