diff --git a/NEWS.md b/NEWS.md index bc6b599c35..dbef189c0f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -228,6 +228,8 @@ 27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report. +28. From v1.12.0 `between()` and `%between%` interpret missing values in `lower=` or `upper=` as unlimited bounds. A new parameter `NAbounds` has been added to achieve the old behaviour of returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks @cguill95 for reporting. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported too. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/between.R b/R/between.R index 97498a97a2..b9588495cb 100644 --- a/R/between.R +++ b/R/between.R @@ -1,27 +1,18 @@ # is x[i] in between lower[i] and upper[i] ? -between = function(x,lower,upper,incbounds=TRUE) { +between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) is.px = function(x) inherits(x, "POSIXct") - is.i64 = function(x) inherits(x, "integer64") + is.i64 = function(x) inherits(x, "integer64") # this is true for nanotime too # POSIX special handling to auto coerce character - if (is.px(x) && !is.null(tz<-attr(x, "tzone", exact=TRUE)) && nzchar(tz) && - (is.character(lower) || is.character(upper))) { - try_posix_cast = function(x, tz) {tryCatch( - list(status=0L, value=as.POSIXct(x, tz = tz)), - error = function(e) list(status=1L, value=NULL, message=e[["message"]]) - )} - if (is.character(lower)) { - ans = try_posix_cast(lower, tz) - if (ans$status==0L) lower = ans$value - else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", ans$message) - } - if (is.character(upper)) { - ans = try_posix_cast(upper, tz) - if (ans$status==0L) upper = ans$value - else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", ans$message) - } + if (is.px(x) && (is.character(lower) || is.character(upper))) { + tz = attr(x, "tzone", exact=TRUE) + if (is.null(tz)) tz = "" + if (is.character(lower)) lower = tryCatch(as.POSIXct(lower, tz=tz), error=function(e)stop( + "'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", e$message)) + if (is.character(upper)) upper = tryCatch(as.POSIXct(upper, tz=tz), error=function(e)stop( + "'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", e$message)) stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal } # POSIX check timezone match @@ -34,23 +25,20 @@ between = function(x,lower,upper,incbounds=TRUE) { stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") } } - # int64 if (is.i64(x)) { if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") # nocov if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) - } else if (is.i64(lower) || is.i64(upper)) { - stop("'lower' and/or 'upper' are integer64 class while 'x' argument is not, align classes before passing to 'between'") } - is.supported = function(x) is.numeric(x) || is.px(x) + is.supported = function(x) is.numeric(x) || is.character(x) || is.px(x) if (is.supported(x) && is.supported(lower) && is.supported(upper)) { - # faster parallelised version for int/double. + # faster parallelised version for int/double/character # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower - .Call(Cbetween, x, lower, upper, incbounds) + .Call(Cbetween, x, lower, upper, incbounds, NAbounds) } else { if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") - # now just for character input. TODO: support character between in Cbetween and remove this branch + if (isTRUE(NAbounds) && (anyNA(lower) || anyNA(upper))) stop("Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") if (incbounds) x>=lower & x<=upper else x>lower & x= 1L & x <= 5L) test(1529.05, x %between% c(1L, 5L), x >= 1L & x <= 5L) test(1529.06, between(x, 1L, 5L, FALSE), x > 1L & x < 5L) x = runif(15) -test(1529.07, between(x, 0.25, 0.75, TRUE), x >= 0.25 & x <= 0.75) +test(1529.07, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75) test(1529.08, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) -test(1529.09, between(x, 0.25, 0.75, FALSE), x > 0.25 & x < 0.75) +test(1529.09, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75) +test(1529.10, between(x, 0.25, NA, NAbounds=NA), ifelse(x<=0.25, FALSE, NA)) +test(1529.11, between(x, NA, 0.75, NAbounds=NA), ifelse(x>=0.75, FALSE, NA)) +test(1529.12, between(x, NA, NA, NAbounds=NA), rep(NA, length(x))) +test(1529.13, between(x, NA, NA, NAbounds=TRUE), rep(TRUE, length(x))) +test(1529.14, between(x, x[3], NA, incbounds=FALSE, NAbounds=NA), ifelse(x<=x[3], FALSE, NA)) +test(1529.15, between(x, x[3], NA, incbounds=TRUE, NAbounds=NA), ifelse(x=x[9], FALSE, NA)) +test(1529.17, between(x, NA, x[9], incbounds=TRUE, NAbounds=NA), ifelse(x>x[9], FALSE, NA)) x = c(NA, runif(15), NA) -test(1529.10, between(x, 0.25, 0.75, TRUE), x >= 0.25 & x <= 0.75) -test(1529.11, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) -test(1529.12, between(x, 0.25, 0.75, FALSE), x > 0.25 & x < 0.75) +test(1529.18, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75) +test(1529.19, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) +test(1529.20, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75) # add tests for which.first and which.last # which.first @@ -9441,7 +9449,7 @@ test(1656, nrow(dt[dt, .(x.id, x.a, x.b, x.c, x.row_id, i.id, i.a, i.b, i.c, i.r # between is vectorised, #534 set.seed(1L) -dt = data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE), z=sample(5,10,TRUE)) +dt = data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE), z=1L+sample(2,10,TRUE)) test(1657, dt[x %between% list(y,z)], dt[x>=y & x<=z]) # fwrite tests @@ -14657,34 +14665,33 @@ test(2037.2, names(DT), 'a') test(2037.3, unname(table(unlist(strsplit(capture.output(foo(DT)), '\n|\\s+')))['ptr']), 1L) # `between` invalid args, and verbose #3516 -test(2038.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be logical") -test(2038.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be logical") +test(2038.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be TRUE or FALSE") +test(2038.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be TRUE or FALSE") old = options(datatable.verbose=TRUE) -test(2038.03, between(1:5, 2L, 4L), output="between parallel processing of integer with recycling took") +test(2038.03, between(1:5, 2L, 4L), output="between parallel processing of integer took") test(2038.04, between(1:5, rep(2L,5L), rep(4L, 5L)), output="between parallel processing of integer took") -test(2038.05, between(as.double(1:5), 2, 4, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") -test(2038.06, between(as.double(1:5), 2, 4), output="between parallel processing of double using closed bounds with recycling took") -test(2038.07, between(as.double(1:5), rep(2, 5L), rep(4, 5L)), output="between parallel processing of double took") -test(2038.08, between(c("foo","bar","paz"), "bag", "fog"), output="optimised between not available for this data type, fallback to slow R routine") +test(2038.05, between(as.double(1:5), 2, 4, incbounds=FALSE), output="between parallel processing of double with open bounds took") +test(2038.06, between(as.double(1:5), 2, 4), output="between parallel processing of double with closed bounds took") +test(2038.07, between(as.double(1:5), rep(2, 5L), rep(4, 5L)), output="between parallel processing of double with closed bounds took") +test(2038.08, between(c("foo","bar","paz"), "bag", "fog"), output="between non-parallel processing of character took") # `between` handle POSIXct type x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 dn = as.POSIXct('2016-09-18 08:00:00') up = as.POSIXct('2016-09-18 09:00:00') -test(2038.09, between(x, dn, up), output="between parallel processing of double using closed bounds with recycling took") -test(2038.10, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") +test(2038.09, between(x, dn, up), output="between parallel processing of double with closed bounds took") +test(2038.10, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double with open bounds took") -# also handling of string lower/upper bounds _only when x has a time zone_ -x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 +# also handling of string lower/upper bounds +x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 # local time zone dn = '2016-09-18 08:00:00' up = '2016-09-18 09:00:00' -test(2038.11, between(x, dn, up), output = 'optimised between not available') -attr(x, 'tzone') = 'UTC' -test(2038.12, between(x, dn, up), output = 'between parallel processing of double') +test(2038.11, between(x, dn, up), ans<-as.logical(c(0,0,0,0,1,1,1,1,1,0,0)), output='between parallel processing of double with closed bounds') +attr(x, 'tzone') = 'UTC' # this conversion will result in different UTC times depending on which local time zone the test runs in +test(2038.12, between(x, x[5], x[9]), ans, output='between parallel processing of double with closed bounds') # additional flexibility -- cast when one bound is already POSIXct -up = as.POSIXct(up, tz="UTC") -test(2038.13, between(x, dn, up), output = 'between parallel processing of double') +test(2038.13, between(x, as.character(x[5]), x[9]), ans, output='between parallel processing of double') options(old) # exceptions in char to POSIX coercion @@ -14706,45 +14713,46 @@ test(2038.17, X[c %between% list(a, b)], X[c %between% .(a, b)]) # between num to int coercion #3517 old = options("datatable.verbose"=TRUE) -test(2038.18, between(1:5, 2, 4), output="between parallel processing of integer with recycling") -test(2038.19, between(1:5, 2L, 4), output="between parallel processing of integer with recycling") -test(2038.20, between(1:5, 2, 4L), output="between parallel processing of integer with recycling") +test(2038.18, between(1:5, 2, 4), output="between parallel processing of integer") +test(2038.19, between(1:5, 2L, 4), output="between parallel processing of integer") +test(2038.20, between(1:5, 2, 4L), output="between parallel processing of integer") # revdep regression #3565 -test(2038.21, between(1:10, -Inf, Inf), rep(TRUE,10), output="between parallel processing of double using closed bounds with recycling") -test(2038.22, between(as.double(1:10), -Inf, Inf), rep(TRUE,10), output="between parallel processing of double using closed bounds with recycling") +test(2038.21, between(1:10, -Inf, Inf), rep(TRUE,10), output="between parallel processing of double with closed bounds") +test(2038.22, between(as.double(1:10), -Inf, Inf), rep(TRUE,10), output="between parallel processing of double with closed bounds") options(old) # between int64 support if (test_bit64) { as.i64 = bit64::as.integer64 - test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' argument is not") - test(2039.02, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' argument is not") + test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="x is not integer64 but.*Please align classes") + test(2039.02, between(1:10, 3, as.i64(6)), error="x is not integer64 but.*Please align classes") + test(2039.03, between(as.i64(1:3), "2", as.i64(4)), error="x is integer64 but lower and/or upper are not") old = options("datatable.verbose"=TRUE) x = as.i64(1:10) ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) ans36open = c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE) - test(2039.03, between(x, 3, 6), ans36, output="between parallel processing of integer64 with recycling") - test(2039.04, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.05, between(x, 3L, 6), ans36, output="between parallel processing of integer64 with recycling") - test(2039.06, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.07, between(x, 3, 6L), ans36, output="between parallel processing of integer64 with recycling") - test(2039.08, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.09, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 with recycling") - test(2039.10, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.11, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took") - test(2039.12, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.04, between(x, 3, 6), ans36, output="between parallel processing of integer64 took") + test(2039.05, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.06, between(x, 3L, 6), ans36, output="between parallel processing of integer64 took") + test(2039.07, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.08, between(x, 3, 6L), ans36, output="between parallel processing of integer64 took") + test(2039.09, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.10, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 took") + test(2039.11, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.12, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took") + test(2039.13, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") maxint = 2147483647 - test(2039.13, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") - test(2039.14, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.14, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") + test(2039.15, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") x[5] = NA ans36[5] = NA ans36open[5] = NA - test(2039.15, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") - test(2039.16, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.17, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 with recycling") - test(2039.18, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 with recycling") - test(2039.19, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") - test(2039.20, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") + test(2039.16, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") + test(2039.17, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.18, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") + test(2039.19, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") + test(2039.20, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") + test(2039.21, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") options(old) } @@ -15671,6 +15679,57 @@ if (test_bit64) { test(2077.06, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g])) } +# between NAs in character bounds #3667 +test(2078.01, letters %between% c("m", NA_character_), c(rep(FALSE, 12L), rep(TRUE, 14L))) +test(2078.02, between(letters, "m", NA_character_, incbounds=TRUE), c(rep(FALSE, 12L), rep(TRUE, 14L))) +test(2078.03, between(letters, "m", NA_character_, incbounds=FALSE), c(rep(FALSE, 13L), rep(TRUE, 13L))) +test(2078.04, between(letters, NA_character_, "m", incbounds=TRUE), c(rep(TRUE, 13L), rep(FALSE, 13L))) +test(2078.05, between(letters, NA_character_, "m", incbounds=FALSE), c(rep(TRUE, 12L), rep(FALSE, 14L))) +test(2078.06, between(letters, NA_character_, NA_character_, incbounds=TRUE), rep(TRUE, 26L)) +test(2078.07, between(letters, NA_character_, NA_character_, incbounds=FALSE), rep(TRUE, 26L)) +test(2078.11, between(c("a","b","c"), c("a","b"), "d"), error="Incompatible vector lengths.*3.*2.*1") +test(2078.12, between(c("a","b","c"), "a", c("b","d")), error="Incompatible vector lengths.*3.*2.*1") +test(2078.13, between(c("a","b","c"), c("a","b"), c("b","d")), error="Incompatible vector lengths.*3.*2.*2") +test(2078.21, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2078.22, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=FALSE), c(FALSE, TRUE, FALSE)) +test(2078.23, between(c("a","c","e"), c("b","c","d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2078.24, between(c("a","c","e"), c("b","c","d"), NA, incbounds=FALSE), c(FALSE, FALSE, TRUE)) +test(2078.25, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2078.26, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=FALSE), c(FALSE, TRUE, TRUE)) +test(2078.27, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) +test(2078.28, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=FALSE), c(FALSE, TRUE, FALSE)) +test(2078.29, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=TRUE), c(TRUE, TRUE, TRUE)) +test(2078.30, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE), c(TRUE, TRUE, FALSE)) +test(2078.31, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=TRUE, NAbounds=NA), c(NA, NA, NA)) +test(2078.32, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE, NAbounds=NA), c(NA, NA, FALSE)) +# between NA bound as unknown rather than unlimited, #3522 +test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=NA), c(FALSE, FALSE, NA, NA, NA)) +test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, TRUE, TRUE)) +test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=FALSE), error="NAbounds must be TRUE or NA") +# nanotime support +if (test_nanotime) { + n=nanotime(1:4) + n[2L]=NA + op = options(datatable.verbose=TRUE) + test(2080.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") + test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") + test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") + options(op) + test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") + test(2080.05, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") +} +# use raw type to cover fallback to R in between.R +old = options(datatable.verbose=TRUE) +test(2081.01, between(as.raw(1:5), as.raw(2), as.raw(4)), c(FALSE, TRUE, TRUE, TRUE, FALSE), output="fallback to slow R") +test(2081.02, between(as.raw(1:5), as.raw(2), as.raw(4), incbounds=FALSE), c(FALSE, FALSE, TRUE, FALSE, FALSE), output="fallback to slow R") +options(old) +test(2081.03, between(3i, NA, NA), error="Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") +# check that lower<=upper +test(2082.01, between(1:3, INT(2,3,4), 2L), error="Item 2 of lower (3) is greater than item 1 of upper (2)") +test(2082.02, between(as.double(1:3), c(2,3,4), 2), error="Item 2 of lower [(]3.*[)] is greater than item 1 of upper [(]2.*[)]") +if (test_bit64) test(2082.03, between(as.integer64(1:3), as.integer64(2), as.integer64(1)), error="Item 1 of lower (2) is greater than item 1 of upper (1)") +test(2082.04, between(letters[1:2], c("foo","bar"), c("bar")), error="Item 1 of lower ('foo') is greater than item 1 of upper ('bar')") + ################################### # Add new tests above this line # diff --git a/man/between.Rd b/man/between.Rd index f740d0983a..f397b34d4d 100644 --- a/man/between.Rd +++ b/man/between.Rd @@ -7,15 +7,17 @@ \description{ Intended for use in \code{i} in \code{[.data.table}. -\code{between} is equivalent to \code{x >= lower & x <= upper} when -\code{incbounds=TRUE}, or \code{x > lower & y < upper} when \code{FALSE}. With a caveat that -\code{NA} in \code{lower} or \code{upper} are taken as a missing bound and return \code{TRUE} not \code{NA}. +\code{between} is equivalent to \code{lower<=x & x<=upper} when +\code{incbounds=TRUE}, or \code{lowerupper}. \code{inrange} checks whether each value in \code{x} is in between any of the intervals provided in \code{lower,upper}. } \usage{ -between(x, lower, upper, incbounds=TRUE) +between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) x \%between\% y inrange(x, lower, upper, incbounds=TRUE) @@ -32,6 +34,7 @@ interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.} \item{incbounds}{ \code{TRUE} means inclusive bounds, i.e., [lower,upper]. \code{FALSE} means exclusive bounds, i.e., (lower,upper). It is set to \code{TRUE} by default for infix notations.} +\item{NAbounds}{ If \code{lower} (\code{upper}) contains an \code{NA} what should \code{lower<=x} (\code{x<=upper}) return? By default \code{TRUE} so that a missing bound is interpreted as unlimited. } } \details{ diff --git a/src/between.c b/src/between.c index 8642389a13..344a9c75af 100644 --- a/src/between.c +++ b/src/between.c @@ -1,9 +1,8 @@ #include "data.table.h" -SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { - +SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { + int nprotect = 0; R_len_t nx = length(x), nl = length(lower), nu = length(upper); - double tic=0.0; if (!nx || !nl || !nu) return (allocVector(LGLSXP, 0)); const int longest = MAX(MAX(nx, nl), nu); @@ -12,13 +11,15 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { (nx!=1 && nx!=longest)) { error("Incompatible vector lengths: length(x)==%d length(lower)==%d length(upper)==%d. Each should be either length 1 or the length of the longest.", nx, nl, nu); } - if (!isLogical(bounds) || LOGICAL(bounds)[0] == NA_LOGICAL) - error("incbounds must be logical TRUE/FALSE."); + const int longestBound = MAX(nl, nu); + if (!isLogical(incbounds) || LOGICAL(incbounds)[0]==NA_LOGICAL) + error("incbounds must be TRUE or FALSE"); + const bool open = !LOGICAL(incbounds)[0]; + if (!isLogical(NAboundsArg) || LOGICAL(NAboundsArg)[0]==FALSE) + error("NAbounds must be TRUE or NA"); + const bool NAbounds = LOGICAL(NAboundsArg)[0]==TRUE; const bool verbose = GetVerbose(); - int nprotect = 0; - bool integer=true; - bool integer64=false; if (isInteger(x)) { if ((isInteger(lower) || isRealReallyInt(lower)) && (isInteger(upper) || isRealReallyInt(upper))) { // #3517 coerce to num to int when possible @@ -32,133 +33,158 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { x = PROTECT(coerceVector(x, REALSXP)); nprotect++; } } - if (inherits(x,"integer64")) { - if (!inherits(lower,"integer64") || !inherits(upper,"integer64")) - error("Internal error in between: 'x' is integer64 while 'lower' and/or 'upper' are not, should have been caught by now"); // # nocov - integer=false; - integer64=true; - } else if (isReal(x)) { - integer=false; - if (!isReal(lower)) { - lower = PROTECT(coerceVector(lower, REALSXP)); nprotect++; // these coerces will convert NA appropriately - } - if (!isReal(upper)) { - upper = PROTECT(coerceVector(upper, REALSXP)); nprotect++; - } - } else if (!isInteger(x)) { - error("Internal error in between: 'x' is not int, double or int64, should have been caught by now"); // # nocov + if (TYPEOF(lower) != TYPEOF(x)) { + lower = PROTECT(coerceVector(lower, TYPEOF(x))); nprotect++; + } + if (TYPEOF(upper) != TYPEOF(x)) { + upper = PROTECT(coerceVector(upper, TYPEOF(x))); nprotect++; } - // TODO: sweep through lower and upper ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX const bool recycleX = nx==1; const bool recycleLow = nl==1; const bool recycleUpp = nu==1; - const bool open = !LOGICAL(bounds)[0]; + const int xMask = recycleX ? 0 : INT_MAX; + const int lowMask = recycleLow ? 0 : INT_MAX; + const int uppMask = recycleUpp ? 0 : INT_MAX; SEXP ans = PROTECT(allocVector(LGLSXP, longest)); nprotect++; int *restrict ansp = LOGICAL(ans); - if (integer) { + double tic=omp_get_wtime(); + + switch (TYPEOF(x)) { + case INTSXP: { const int *lp = INTEGER(lower); const int *up = INTEGER(upper); const int *xp = INTEGER(x); - if (!recycleX && recycleLow && recycleUpp) { - const int l = lp[0] + open; // +open so we can always use >= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) - const int u = up[0]==NA_INTEGER ? INT_MAX : up[0] - open; - if (verbose) tic = omp_get_wtime(); + for (int i=0; iu) + error("Item %d of lower (%d) is greater than item %d of upper (%d)", (i&lowMask)+1, l, (i&uppMask)+1, u); + } + if (NAbounds) { // default NAbounds==TRUE => NA bound means TRUE; i.e. asif lower=-Inf or upper==Inf) #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; i= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) } - if (verbose) Rprintf("between parallel processing of integer with recycling took %8.3fs\n", omp_get_wtime()-tic); - } - else { - const int xMask = recycleX ? 0 : INT_MAX; - const int lowMask = recycleLow ? 0 : INT_MAX; - const int uppMask = recycleUpp ? 0 : INT_MAX; - if (verbose) tic = omp_get_wtime(); + } else { #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; iu-open) || (lok && elemu) + error("Item %d of lower (%lld) is greater than item %d of upper (%lld)", (i&lowMask)+1, l, (i&uppMask)+1, u); + } + if (NAbounds) { #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; iu-open) || (lok && elemu) + error("Item %d of lower (%f) is greater than item %d of upper (%f)", (i&lowMask)+1, l, (i&uppMask)+1, u); } - if (verbose) Rprintf("between parallel processing of double took %8.3fs\n", omp_get_wtime()-tic); - } - } else { - // type integer64 - const int64_t *lp = (int64_t *)REAL(lower); - const int64_t *up = (int64_t *)REAL(upper); - const int64_t *xp = (int64_t *)REAL(x); - if (!recycleX && recycleLow && recycleUpp) { - const int64_t l = lp[0] + open; // +open as for int32 branch - const int64_t u = up[0]==NA_INTEGER64 ? MAX_INTEGER64 : up[0] - open; - if (verbose) tic = omp_get_wtime(); - #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; i=u) || (lok && elem<=l)) ? FALSE : NA_LOGICAL; + } + } + if (verbose) Rprintf("between parallel processing of double with open bounds took %8.3fs\n", omp_get_wtime()-tic); + } else { + if (NAbounds) { + #pragma omp parallel for num_threads(getDTthreads()) + for (int i=0; iu) || (lok && elem0) + error("Item %d of lower ('%s') is greater than item %d of upper ('%s')", (i&lowMask)+1, CHAR(l), (i&uppMask)+1, CHAR(u)); + } + if (NAbounds) { + for (int i=0; i