From 11422052896fefc70cd8d7a86c8cf981552122a5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 30 Jul 2019 13:40:23 +0200 Subject: [PATCH 01/16] between support missing bounds for character input, closes #3667 --- NEWS.md | 2 ++ R/between.R | 19 +++++++++++++++++-- inst/tests/tests.Rraw | 10 ++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 411a589857..2a630ce06b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -198,6 +198,8 @@ 24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. +25. Function `between` and operator `%between%` now handles missing bounds for character input as documented, [#3667](https://github.com/Rdatatable/data.table/issues/3667). Thanks to @AnonymousBoba for the report. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/between.R b/R/between.R index 97498a97a2..e052f042cd 100644 --- a/R/between.R +++ b/R/between.R @@ -51,8 +51,23 @@ between = function(x,lower,upper,incbounds=TRUE) { } else { if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") # now just for character input. TODO: support character between in Cbetween and remove this branch - if (incbounds) x>=lower & x<=upper - else x>lower & x=lower else x>lower + } else { + stop("internal error in between, all cases of lower/upper NAs for character should be handled already, please report") # nocov + } + } else { + if (incbounds) x>=lower & x<=upper + else x>lower & x Date: Tue, 30 Jul 2019 14:45:22 +0200 Subject: [PATCH 02/16] also support vector bounds for character --- R/between.R | 53 ++++++++++++++++++++++++++++++++++--------- inst/tests/tests.Rraw | 13 +++++++++++ src/between.c | 4 ++-- 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/R/between.R b/R/between.R index e052f042cd..7d4a52b800 100644 --- a/R/between.R +++ b/R/between.R @@ -51,22 +51,53 @@ between = function(x,lower,upper,incbounds=TRUE) { } else { if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") # now just for character input. TODO: support character between in Cbetween and remove this branch + xlen = length(x) + llen = length(lower) + ulen = length(upper) + if ((llen!=1L && llen!=xlen) || (ulen!=1L && ulen!=xlen)) + stop("lower/upper bounds recycling works only for scalar bound value") # support NAs as missing bounds also for character #3667 lower_na = is.na(lower) upper_na = is.na(upper) - if (lower_na || upper_na) { - if (lower_na && upper_na) { - rep(TRUE, length(x)) - } else if (lower_na) { - if (incbounds) x<=upper else x=lower else x>lower - } else { - stop("internal error in between, all cases of lower/upper NAs for character should be handled already, please report") # nocov - } - } else { + if (!any(c(lower_na, upper_na))) { if (incbounds) x>=lower & x<=upper else x>lower & x=lower else x>lower + } else { + stop("internal error in between, all cases of lower/upper NAs for character should be handled already, please report") # nocov + } + } else { # bounds as vector, slow + ans = logical(length(x)) + if (llen==1L) { + lower = rep(lower, xlen) + lower_na = rep(lower_na, xlen) + } + if (ulen==1L) { + upper = rep(upper, xlen) + upper_na = rep(upper_na, xlen) + } + for (i in seq_len(xlen)) { + if (lower_na[i] && upper_na[i]) { + ans[i] = TRUE + } else if (!lower_na[i] && !upper_na[i]) { + ans[i] = if (incbounds) x[i]>=lower[i] & x[i]<=upper[i] else x[i]>lower[i] & x[i]=lower[i] else x[i]>lower[i] + } else { + stop("internal error in between, missing bound for character, vector bounds, please report") # nocov + } + } + ans + } } } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f2e2eaa1ef..40e590a028 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15436,6 +15436,19 @@ test(2072.04, between(letters, NA_character_, "m", incbounds=TRUE), c(rep(TRUE, test(2072.05, between(letters, NA_character_, "m", incbounds=FALSE), c(rep(TRUE, 12L), rep(FALSE, 14L))) test(2072.06, between(letters, NA_character_, NA_character_, incbounds=TRUE), rep(TRUE, 26L)) test(2072.07, between(letters, NA_character_, NA_character_, incbounds=FALSE), rep(TRUE, 26L)) +test(2072.11, between(c("a","b","c"), c("a","b"), "d"), error="bounds recycling works only for scalar") +test(2072.12, between(c("a","b","c"), "a", c("b","d")), error="bounds recycling works only for scalar") +test(2072.13, between(c("a","b","c"), c("a","b"), c("b","d")), error="bounds recycling works only for scalar") +test(2072.21, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2072.22, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=FALSE), c(FALSE, TRUE, FALSE)) +test(2072.23, between(c("a","c","e"), c("b","c","d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2072.24, between(c("a","c","e"), c("b","c","d"), NA, incbounds=FALSE), c(FALSE, FALSE, TRUE)) +test(2072.25, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2072.26, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=FALSE), c(FALSE, TRUE, TRUE)) +test(2072.27, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) +test(2072.28, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=FALSE), c(FALSE, TRUE, FALSE)) +test(2072.29, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) +test(2072.30, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=FALSE), c(TRUE, TRUE, FALSE)) ################################### diff --git a/src/between.c b/src/between.c index 67ed009f35..b0826f6922 100644 --- a/src/between.c +++ b/src/between.c @@ -44,8 +44,8 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { x = PROTECT(coerceVector(x, REALSXP)); nprotect++; } } - if (inherits(x,"integer64")) { - if (!inherits(lower,"integer64") || !inherits(upper,"integer64")) + if (INHERITS(x,char_integer64)) { + if (!INHERITS(lower,char_integer64) || !INHERITS(upper,char_integer64)) error("Internal error in between: 'x' is integer64 while 'lower' and/or 'upper' are not, should have been caught by now"); // # nocov integer=false; integer64=true; From 2ab6a0b2b629f1cd18c3b3aabf85fbab12e7e988 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 30 Jul 2019 17:44:38 +0200 Subject: [PATCH 03/16] between add option for NA bounds as unknown rather missing, closes #3522 --- NEWS.md | 2 ++ R/between.R | 9 +++++---- inst/tests/tests.Rraw | 3 +++ man/between.Rd | 4 +++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2a630ce06b..672fbb6464 100644 --- a/NEWS.md +++ b/NEWS.md @@ -200,6 +200,8 @@ 25. Function `between` and operator `%between%` now handles missing bounds for character input as documented, [#3667](https://github.com/Rdatatable/data.table/issues/3667). Thanks to @AnonymousBoba for the report. +26. In recent major release function `between` changed behaviour in handling `NA` values for lower/upper bounds. Starting from 1.12.0 missing value has been interpreted as missing bound rather than unknown bound. New argument `NAbounds` has been added to achieve old behaviour. See `between` manual for details. [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks to @cguill95 for reporting. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/between.R b/R/between.R index 7d4a52b800..1af66ab2ca 100644 --- a/R/between.R +++ b/R/between.R @@ -1,5 +1,6 @@ # is x[i] in between lower[i] and upper[i] ? -between = function(x,lower,upper,incbounds=TRUE) { +between = function(x,lower,upper,incbounds=TRUE,NAbounds=FALSE) { + if (!isTRUEorFALSE(NAbounds)) stop("NAbounds must be TRUE or FALSE") if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) @@ -43,13 +44,13 @@ between = function(x,lower,upper,incbounds=TRUE) { stop("'lower' and/or 'upper' are integer64 class while 'x' argument is not, align classes before passing to 'between'") } is.supported = function(x) is.numeric(x) || is.px(x) - if (is.supported(x) && is.supported(lower) && is.supported(upper)) { + if (!NAbounds && is.supported(x) && is.supported(lower) && is.supported(upper)) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower .Call(Cbetween, x, lower, upper, incbounds) } else { - if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") + if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type or NAbounds argument, fallback to slow R routine\n") # now just for character input. TODO: support character between in Cbetween and remove this branch xlen = length(x) llen = length(lower) @@ -59,7 +60,7 @@ between = function(x,lower,upper,incbounds=TRUE) { # support NAs as missing bounds also for character #3667 lower_na = is.na(lower) upper_na = is.na(upper) - if (!any(c(lower_na, upper_na))) { + if (NAbounds || !any(c(lower_na, upper_na))) { if (incbounds) x>=lower & x<=upper else x>lower & x= lower & x <= upper} when \code{incbounds=TRUE}, or \code{x > lower & y < upper} when \code{FALSE}. With a caveat that \code{NA} in \code{lower} or \code{upper} are taken as a missing bound and return \code{TRUE} not \code{NA}. +This can be changed by setting \code{NAbounds} to \code{TRUE}. \code{inrange} checks whether each value in \code{x} is in between any of the intervals provided in \code{lower,upper}. } \usage{ -between(x, lower, upper, incbounds=TRUE) +between(x, lower, upper, incbounds=TRUE, NAbounds=FALSE) x \%between\% y inrange(x, lower, upper, incbounds=TRUE) @@ -32,6 +33,7 @@ interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.} \item{incbounds}{ \code{TRUE} means inclusive bounds, i.e., [lower,upper]. \code{FALSE} means exclusive bounds, i.e., (lower,upper). It is set to \code{TRUE} by default for infix notations.} +\item{NAbounds}{ \code{TRUE} force to treat \code{NA} bound as \code{NA} rather than missing bound.} } \details{ From bb61d72281e9c388fdcad7b1ac8dd260b85e54b0 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 30 Jul 2019 17:59:45 +0200 Subject: [PATCH 04/16] amend changes to test for verbose message that got updated --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ecdeb8e2ca..560d6e886c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14651,7 +14651,7 @@ test(2038.04, between(1:5, rep(2L,5L), rep(4L, 5L)), output="between parallel pr test(2038.05, between(as.double(1:5), 2, 4, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") test(2038.06, between(as.double(1:5), 2, 4), output="between parallel processing of double using closed bounds with recycling took") test(2038.07, between(as.double(1:5), rep(2, 5L), rep(4, 5L)), output="between parallel processing of double took") -test(2038.08, between(c("foo","bar","paz"), "bag", "fog"), output="optimised between not available for this data type, fallback to slow R routine") +test(2038.08, between(c("foo","bar","paz"), "bag", "fog"), output="optimised between not available") # `between` handle POSIXct type x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 From 4f5809bc7d62b5acf1c8bea5802b18c2f7387599 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 31 Jul 2019 21:49:25 +0200 Subject: [PATCH 05/16] add tests for full coverage --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 560d6e886c..ba5727759f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15452,6 +15452,7 @@ test(2072.30, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=FALSE), c(TRU # between NA bound as unknown rather than missing #3522 test(2073.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=TRUE), c(FALSE, FALSE, NA, NA, NA)) test(2073.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, NA, NA)) +test(2073.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=NA), error="NAbounds must be TRUE or FALSE") ################################### From eabfe640636358e36de7a250827cd2f13735a650 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 3 Aug 2019 22:09:46 +0200 Subject: [PATCH 06/16] between gets nanotime support --- NEWS.md | 2 +- R/between.R | 10 ++++++++-- inst/tests/tests.Rraw | 12 ++++++++++++ src/between.c | 12 ++++++------ 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 672fbb6464..bde45c6455 100644 --- a/NEWS.md +++ b/NEWS.md @@ -198,7 +198,7 @@ 24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. -25. Function `between` and operator `%between%` now handles missing bounds for character input as documented, [#3667](https://github.com/Rdatatable/data.table/issues/3667). Thanks to @AnonymousBoba for the report. +25. Function `between` and operator `%between%` now handles missing bounds for character input as documented, [#3667](https://github.com/Rdatatable/data.table/issues/3667). Thanks to @AnonymousBoba for the report. It also now supports `nanotime` class objects. 26. In recent major release function `between` changed behaviour in handling `NA` values for lower/upper bounds. Starting from 1.12.0 missing value has been interpreted as missing bound rather than unknown bound. New argument `NAbounds` has been added to achieve old behaviour. See `between` manual for details. [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks to @cguill95 for reporting. diff --git a/R/between.R b/R/between.R index 1af66ab2ca..9c59226aad 100644 --- a/R/between.R +++ b/R/between.R @@ -6,6 +6,7 @@ between = function(x,lower,upper,incbounds=TRUE,NAbounds=FALSE) { if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) is.px = function(x) inherits(x, "POSIXct") is.i64 = function(x) inherits(x, "integer64") + is.nt = function(x) inherits(x, "nanotime") # POSIX special handling to auto coerce character if (is.px(x) && !is.null(tz<-attr(x, "tzone", exact=TRUE)) && nzchar(tz) && (is.character(lower) || is.character(upper))) { @@ -35,8 +36,13 @@ between = function(x,lower,upper,incbounds=TRUE,NAbounds=FALSE) { stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") } } - # int64 - if (is.i64(x)) { + if (is.nt(x)) { + if (!requireNamespace("nanotime", quietly=TRUE)) stop("trying to use nanotime class when 'nanotime' package is not installed") # nocov + if (!is.nt(lower) && is.numeric(lower)) lower = nanotime::nanotime(lower) + if (!is.nt(upper) && is.numeric(upper)) upper = nanotime::nanotime(upper) + } else if (is.nt(lower) || is.nt(upper)) { + stop("'lower' and/or 'upper' are nanotime class while 'x' argument is not, align classes before passing to 'between'") + } else if (is.i64(x)) { if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") # nocov if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ba5727759f..343c6f8cf3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15453,6 +15453,18 @@ test(2072.30, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=FALSE), c(TRU test(2073.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=TRUE), c(FALSE, FALSE, NA, NA, NA)) test(2073.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, NA, NA)) test(2073.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=NA), error="NAbounds must be TRUE or FALSE") +# nanotime support +if (test_nanotime) { + n=nanotime(1:4) + n[2L]=NA + op = options(datatable.verbose=TRUE) + test(2074.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") + test(2074.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") + test(2074.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=TRUE), c(FALSE, NA, FALSE, NA), output="optimised between not available for this data type or NAbounds argument") + options(op) + test(2074.04, between(1:10, nanotime(3), nanotime(6)), error="are nanotime class while 'x' argument is not") + test(2074.05, between(1:10, 3, nanotime(6)), error="are nanotime class while 'x' argument is not") +} ################################### diff --git a/src/between.c b/src/between.c index b0826f6922..08492250ec 100644 --- a/src/between.c +++ b/src/between.c @@ -44,9 +44,11 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { x = PROTECT(coerceVector(x, REALSXP)); nprotect++; } } - if (INHERITS(x,char_integer64)) { - if (!INHERITS(lower,char_integer64) || !INHERITS(upper,char_integer64)) + if (INHERITS(x,char_integer64) || INHERITS(x,char_nanotime)) { + if (INHERITS(x,char_integer64) && (!INHERITS(lower,char_integer64) || !INHERITS(upper,char_integer64))) error("Internal error in between: 'x' is integer64 while 'lower' and/or 'upper' are not, should have been caught by now"); // # nocov + if (INHERITS(x,char_nanotime) && (!INHERITS(lower,char_nanotime) || !INHERITS(upper,char_nanotime))) + error("Internal error in between: 'x' is nanotime while 'lower' and/or 'upper' are not, should have been caught by now"); // # nocov integer=false; integer64=true; } else if (isReal(x)) { @@ -98,8 +100,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { } if (verbose) Rprintf("between parallel processing of integer took %8.3fs\n", omp_get_wtime()-tic); } - } else if (!integer64) { - // type real + } else if (!integer64) { // type double const double *lp = REAL(lower); const double *up = REAL(upper); const double *xp = REAL(x); @@ -140,8 +141,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { } if (verbose) Rprintf("between parallel processing of double took %8.3fs\n", omp_get_wtime()-tic); } - } else { - // type integer64 + } else { // type integer64 or nanotime const int64_t *lp = (int64_t *)REAL(lower); const int64_t *up = (int64_t *)REAL(upper); const int64_t *xp = (int64_t *)REAL(x); From ab69d03449c3030da0ffb149290695d9a94e400f Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 14 Aug 2019 16:23:50 -0700 Subject: [PATCH 07/16] test number increasing after merge --- inst/tests/tests.Rraw | 57 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 694fe9adc2..64896d0c43 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15671,43 +15671,42 @@ if (test_bit64) { test(2077.06, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g])) } - # between NAs in character bounds #3667 -test(2072.01, letters %between% c("m", NA_character_), c(rep(FALSE, 12L), rep(TRUE, 14L))) -test(2072.02, between(letters, "m", NA_character_, incbounds=TRUE), c(rep(FALSE, 12L), rep(TRUE, 14L))) -test(2072.03, between(letters, "m", NA_character_, incbounds=FALSE), c(rep(FALSE, 13L), rep(TRUE, 13L))) -test(2072.04, between(letters, NA_character_, "m", incbounds=TRUE), c(rep(TRUE, 13L), rep(FALSE, 13L))) -test(2072.05, between(letters, NA_character_, "m", incbounds=FALSE), c(rep(TRUE, 12L), rep(FALSE, 14L))) -test(2072.06, between(letters, NA_character_, NA_character_, incbounds=TRUE), rep(TRUE, 26L)) -test(2072.07, between(letters, NA_character_, NA_character_, incbounds=FALSE), rep(TRUE, 26L)) -test(2072.11, between(c("a","b","c"), c("a","b"), "d"), error="bounds recycling works only for scalar") -test(2072.12, between(c("a","b","c"), "a", c("b","d")), error="bounds recycling works only for scalar") -test(2072.13, between(c("a","b","c"), c("a","b"), c("b","d")), error="bounds recycling works only for scalar") -test(2072.21, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=TRUE), c(FALSE, TRUE, TRUE)) -test(2072.22, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=FALSE), c(FALSE, TRUE, FALSE)) -test(2072.23, between(c("a","c","e"), c("b","c","d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) -test(2072.24, between(c("a","c","e"), c("b","c","d"), NA, incbounds=FALSE), c(FALSE, FALSE, TRUE)) -test(2072.25, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) -test(2072.26, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=FALSE), c(FALSE, TRUE, TRUE)) -test(2072.27, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) -test(2072.28, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=FALSE), c(FALSE, TRUE, FALSE)) -test(2072.29, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) -test(2072.30, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=FALSE), c(TRUE, TRUE, FALSE)) +test(2078.01, letters %between% c("m", NA_character_), c(rep(FALSE, 12L), rep(TRUE, 14L))) +test(2078.02, between(letters, "m", NA_character_, incbounds=TRUE), c(rep(FALSE, 12L), rep(TRUE, 14L))) +test(2078.03, between(letters, "m", NA_character_, incbounds=FALSE), c(rep(FALSE, 13L), rep(TRUE, 13L))) +test(2078.04, between(letters, NA_character_, "m", incbounds=TRUE), c(rep(TRUE, 13L), rep(FALSE, 13L))) +test(2078.05, between(letters, NA_character_, "m", incbounds=FALSE), c(rep(TRUE, 12L), rep(FALSE, 14L))) +test(2078.06, between(letters, NA_character_, NA_character_, incbounds=TRUE), rep(TRUE, 26L)) +test(2078.07, between(letters, NA_character_, NA_character_, incbounds=FALSE), rep(TRUE, 26L)) +test(2078.11, between(c("a","b","c"), c("a","b"), "d"), error="bounds recycling works only for scalar") +test(2078.12, between(c("a","b","c"), "a", c("b","d")), error="bounds recycling works only for scalar") +test(2078.13, between(c("a","b","c"), c("a","b"), c("b","d")), error="bounds recycling works only for scalar") +test(2078.21, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2078.22, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=FALSE), c(FALSE, TRUE, FALSE)) +test(2078.23, between(c("a","c","e"), c("b","c","d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2078.24, between(c("a","c","e"), c("b","c","d"), NA, incbounds=FALSE), c(FALSE, FALSE, TRUE)) +test(2078.25, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) +test(2078.26, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=FALSE), c(FALSE, TRUE, TRUE)) +test(2078.27, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) +test(2078.28, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=FALSE), c(FALSE, TRUE, FALSE)) +test(2078.29, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) +test(2078.30, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=FALSE), c(TRUE, TRUE, FALSE)) # between NA bound as unknown rather than missing #3522 -test(2073.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=TRUE), c(FALSE, FALSE, NA, NA, NA)) -test(2073.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, NA, NA)) -test(2073.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=NA), error="NAbounds must be TRUE or FALSE") +test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=TRUE), c(FALSE, FALSE, NA, NA, NA)) +test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, NA, NA)) +test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=NA), error="NAbounds must be TRUE or FALSE") # nanotime support if (test_nanotime) { n=nanotime(1:4) n[2L]=NA op = options(datatable.verbose=TRUE) - test(2074.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") - test(2074.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") - test(2074.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=TRUE), c(FALSE, NA, FALSE, NA), output="optimised between not available for this data type or NAbounds argument") + test(2080.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") + test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") + test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=TRUE), c(FALSE, NA, FALSE, NA), output="optimised between not available for this data type or NAbounds argument") options(op) - test(2074.04, between(1:10, nanotime(3), nanotime(6)), error="are nanotime class while 'x' argument is not") - test(2074.05, between(1:10, 3, nanotime(6)), error="are nanotime class while 'x' argument is not") + test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="are nanotime class while 'x' argument is not") + test(2080.05, between(1:10, 3, nanotime(6)), error="are nanotime class while 'x' argument is not") } From 344ffa00a6dbb1d96d58bb0a83b92f0db67d46bf Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 14 Aug 2019 17:18:00 -0700 Subject: [PATCH 08/16] nanotime inherits from integer64 so we can just deal with the one integer64 case (motivated by coverage) --- R/between.R | 13 +++---------- inst/tests/tests.Rraw | 8 ++++---- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/R/between.R b/R/between.R index 9c59226aad..88cd593bb2 100644 --- a/R/between.R +++ b/R/between.R @@ -5,8 +5,7 @@ between = function(x,lower,upper,incbounds=TRUE,NAbounds=FALSE) { if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) is.px = function(x) inherits(x, "POSIXct") - is.i64 = function(x) inherits(x, "integer64") - is.nt = function(x) inherits(x, "nanotime") + is.i64 = function(x) inherits(x, "integer64") # this is true for nanotime too # POSIX special handling to auto coerce character if (is.px(x) && !is.null(tz<-attr(x, "tzone", exact=TRUE)) && nzchar(tz) && (is.character(lower) || is.character(upper))) { @@ -36,18 +35,12 @@ between = function(x,lower,upper,incbounds=TRUE,NAbounds=FALSE) { stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") } } - if (is.nt(x)) { - if (!requireNamespace("nanotime", quietly=TRUE)) stop("trying to use nanotime class when 'nanotime' package is not installed") # nocov - if (!is.nt(lower) && is.numeric(lower)) lower = nanotime::nanotime(lower) - if (!is.nt(upper) && is.numeric(upper)) upper = nanotime::nanotime(upper) - } else if (is.nt(lower) || is.nt(upper)) { - stop("'lower' and/or 'upper' are nanotime class while 'x' argument is not, align classes before passing to 'between'") - } else if (is.i64(x)) { + if (is.i64(x)) { if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") # nocov if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) } else if (is.i64(lower) || is.i64(upper)) { - stop("'lower' and/or 'upper' are integer64 class while 'x' argument is not, align classes before passing to 'between'") + stop("'lower' and/or 'upper' are integer64 class while 'x' is not. Please align classes before passing to 'between'.") } is.supported = function(x) is.numeric(x) || is.px(x) if (!NAbounds && is.supported(x) && is.supported(lower) && is.supported(upper)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 64896d0c43..09d4940e4a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14717,8 +14717,8 @@ options(old) # between int64 support if (test_bit64) { as.i64 = bit64::as.integer64 - test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' argument is not") - test(2039.02, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' argument is not") + test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' is not") + test(2039.02, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' is not") old = options("datatable.verbose"=TRUE) x = as.i64(1:10) ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) @@ -15705,8 +15705,8 @@ if (test_nanotime) { test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=TRUE), c(FALSE, NA, FALSE, NA), output="optimised between not available for this data type or NAbounds argument") options(op) - test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="are nanotime class while 'x' argument is not") - test(2080.05, between(1:10, 3, nanotime(6)), error="are nanotime class while 'x' argument is not") + test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="are integer64 class while 'x' is not") + test(2080.05, between(1:10, 3, nanotime(6)), error="are integer64 class while 'x' is not") } From 8565fc1a7041695dcb421a8e10bb2127c2922461 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 21 Aug 2019 14:10:30 -0700 Subject: [PATCH 09/16] interim --- NEWS.md | 4 +- R/between.R | 64 +++------------ inst/tests/tests.Rraw | 24 +++--- src/between.c | 176 ++++++++++++++++++------------------------ src/data.table.h | 2 +- 5 files changed, 97 insertions(+), 173 deletions(-) diff --git a/NEWS.md b/NEWS.md index 587125963b..156aedd2c2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -228,9 +228,7 @@ 27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report. -28. `between()` and operator `%between%` now handle missing bounds for character input as documented, [#3667](https://github.com/Rdatatable/data.table/issues/3667). Thanks to @AnonymousBoba for the report. -They also now support `nanotime` class objects. Further, starting from 1.12.0 missing value has been interpreted as missing bound and returned `TRUE`, rather than unknown bound returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks to @cguill95 for reporting. -New argument `NAbounds` has been added to achieve the old behaviour; see `?between`. +28. From v1.12.0, `between()` and `%between%` interpreted missing values in `lower=` or `upper=` as missing bounds and return `TRUE` rather than unknown bound returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522) (thanks @cguill95). New argument `NAbound` has been added to achieve the old behaviour; see `?between`. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported. #### NOTES diff --git a/R/between.R b/R/between.R index 88cd593bb2..11cac9718a 100644 --- a/R/between.R +++ b/R/between.R @@ -1,6 +1,6 @@ # is x[i] in between lower[i] and upper[i] ? -between = function(x,lower,upper,incbounds=TRUE,NAbounds=FALSE) { - if (!isTRUEorFALSE(NAbounds)) stop("NAbounds must be TRUE or FALSE") +between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { + if (!isTRUEorNA(NAbounds)) stop("NAbounds must be TRUE or NA") if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) @@ -42,63 +42,17 @@ between = function(x,lower,upper,incbounds=TRUE,NAbounds=FALSE) { } else if (is.i64(lower) || is.i64(upper)) { stop("'lower' and/or 'upper' are integer64 class while 'x' is not. Please align classes before passing to 'between'.") } - is.supported = function(x) is.numeric(x) || is.px(x) - if (!NAbounds && is.supported(x) && is.supported(lower) && is.supported(upper)) { + is.supported = function(x) is.numeric(x) || is.character(x) || is.px(x) + if (is.supported(x) && is.supported(lower) && is.supported(upper)) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower - .Call(Cbetween, x, lower, upper, incbounds) + .Call(Cbetween, x, lower, upper, incbounds, NAbounds) } else { - if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type or NAbounds argument, fallback to slow R routine\n") - # now just for character input. TODO: support character between in Cbetween and remove this branch - xlen = length(x) - llen = length(lower) - ulen = length(upper) - if ((llen!=1L && llen!=xlen) || (ulen!=1L && ulen!=xlen)) - stop("lower/upper bounds recycling works only for scalar bound value") - # support NAs as missing bounds also for character #3667 - lower_na = is.na(lower) - upper_na = is.na(upper) - if (NAbounds || !any(c(lower_na, upper_na))) { - if (incbounds) x>=lower & x<=upper - else x>lower & x=lower else x>lower - } else { - stop("internal error in between, all cases of lower/upper NAs for character should be handled already, please report") # nocov - } - } else { # bounds as vector, slow - ans = logical(length(x)) - if (llen==1L) { - lower = rep(lower, xlen) - lower_na = rep(lower_na, xlen) - } - if (ulen==1L) { - upper = rep(upper, xlen) - upper_na = rep(upper_na, xlen) - } - for (i in seq_len(xlen)) { - if (lower_na[i] && upper_na[i]) { - ans[i] = TRUE - } else if (!lower_na[i] && !upper_na[i]) { - ans[i] = if (incbounds) x[i]>=lower[i] & x[i]<=upper[i] else x[i]>lower[i] & x[i]=lower[i] else x[i]>lower[i] - } else { - stop("internal error in between, missing bound for character, vector bounds, please report") # nocov - } - } - ans - } - } + if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") + if (isTRUE(NAbounds)) stop("Not yet implemented NAbounds for this non-numeric and non-character type") + if (incbounds) x>=lower & x<=upper + else x>lower & xupper or lower==INT_MAX const bool recycleX = nx==1; const bool recycleLow = nl==1; const bool recycleUpp = nu==1; - const bool open = !LOGICAL(bounds)[0]; + const int xMask = recycleX ? 0 : INT_MAX; + const int lowMask = recycleLow ? 0 : INT_MAX; + const int uppMask = recycleUpp ? 0 : INT_MAX; + const bool open = !LOGICAL(incbounds)[0]; + const bool NAboundsNA = LOGICAL(NAbounds)[0]==NA_INTEGER; SEXP ans = PROTECT(allocVector(LGLSXP, longest)); nprotect++; int *restrict ansp = LOGICAL(ans); - if (integer) { + double tic=omp_get_wtime(); + switch (TYPEOF(x)) { + case INTSXP: { const int *lp = INTEGER(lower); const int *up = INTEGER(upper); const int *xp = INTEGER(x); - if (!recycleX && recycleLow && recycleUpp) { - const int l = lp[0] + open; // +open so we can always use >= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) - const int u = up[0]==NA_INTEGER ? INT_MAX : up[0] - open; - if (verbose) tic = omp_get_wtime(); - #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; i= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) + // one relatively complex condition here (using one ?:, short-circuit '&&', and const bool NAboundsNA (predicted away)) should be + // as fast as specialized extra loops but with simplicity, correctness & coverage benefits. } - else { - const int xMask = recycleX ? 0 : INT_MAX; - const int lowMask = recycleLow ? 0 : INT_MAX; - const int uppMask = recycleUpp ? 0 : INT_MAX; - if (verbose) tic = omp_get_wtime(); + if (verbose) Rprintf("between parallel processing of integer took %8.3fs\n", omp_get_wtime()-tic); + } break; + case REALSXP: + if (INHERITS(x, char_integer64)) { + const int64_t *lp = (int64_t *)REAL(lower); + const int64_t *up = (int64_t *)REAL(upper); + const int64_t *xp = (int64_t *)REAL(x); #pragma omp parallel for num_threads(getDTthreads()) for (int i=0; i Date: Thu, 22 Aug 2019 20:08:34 -0700 Subject: [PATCH 10/16] interim --- R/between.R | 38 ++++++----- inst/tests/tests.Rraw | 59 ++++++++--------- src/between.c | 150 ++++++++++++++++++++++++++---------------- src/data.table.h | 5 +- src/init.c | 19 +----- src/utils.c | 33 ++++++++++ 6 files changed, 182 insertions(+), 122 deletions(-) diff --git a/R/between.R b/R/between.R index 11cac9718a..c24a9492fa 100644 --- a/R/between.R +++ b/R/between.R @@ -7,22 +7,28 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { is.px = function(x) inherits(x, "POSIXct") is.i64 = function(x) inherits(x, "integer64") # this is true for nanotime too # POSIX special handling to auto coerce character - if (is.px(x) && !is.null(tz<-attr(x, "tzone", exact=TRUE)) && nzchar(tz) && - (is.character(lower) || is.character(upper))) { - try_posix_cast = function(x, tz) {tryCatch( - list(status=0L, value=as.POSIXct(x, tz = tz)), - error = function(e) list(status=1L, value=NULL, message=e[["message"]]) - )} - if (is.character(lower)) { - ans = try_posix_cast(lower, tz) - if (ans$status==0L) lower = ans$value - else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", ans$message) - } - if (is.character(upper)) { - ans = try_posix_cast(upper, tz) - if (ans$status==0L) upper = ans$value - else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", ans$message) - } + if (is.px(x) && (is.character(lower) || is.character(upper))) { + tz = attr(x, "tzone", exact=TRUE) + if (is.null(tz)) tz = "" + if (is.character(lower)) lower = tryCatch(as.POSIXct(lower, tz=tz), error=function(e)stop( + "'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", e$message)) + if (is.character(upper)) upper = tryCatch(as.POSIXct(upper, tz=tz), error=function(e)stop( + "'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", e$message)) + + # try_posix_cast = function(x, tz) {tryCatch( + # list(status=0L, value=as.POSIXct(x, tz = tz)), + # error = function(e) list(status=1L, value=NULL, message=e[["message"]]) + # )} + # if (is.character(lower)) { + # ans = try_posix_cast(lower, tz) + # if (ans$status==0L) lower = ans$value + # else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", ans$message) + # } + # if (is.character(upper)) { + # ans = try_posix_cast(upper, tz) + # if (ans$status==0L) upper = ans$value + # else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", ans$message) + # } stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal } # POSIX check timezone match diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8f3f931d3e..0fdacd80a5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14674,17 +14674,16 @@ up = as.POSIXct('2016-09-18 09:00:00') test(2038.09, between(x, dn, up), output="between parallel processing of double with closed bounds took") test(2038.10, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double with open bounds took") -# also handling of string lower/upper bounds _only when x has a time zone_ -x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 +# also handling of string lower/upper bounds +x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 # local time zone dn = '2016-09-18 08:00:00' up = '2016-09-18 09:00:00' -test(2038.11, between(x, dn, up), output = 'optimised between not available') -attr(x, 'tzone') = 'UTC' -test(2038.12, between(x, dn, up), output = 'between parallel processing of double') +test(2038.11, between(x, dn, up), ans<-as.logical(c(0,0,0,0,1,1,1,1,1,0,0)), output='between parallel processing of double with closed bounds') +attr(x, 'tzone') = 'UTC' # this conversion will result in different UTC times depending on which local time zone the test runs in +test(2038.12, between(x, x[5], x[9]), ans, output='between parallel processing of double with closed bounds') # additional flexibility -- cast when one bound is already POSIXct -up = as.POSIXct(up, tz="UTC") -test(2038.13, between(x, dn, up), output = 'between parallel processing of double') +test(2038.13, between(x, as.character(x[5]), x[9]), ans, output='between parallel processing of double') options(old) # exceptions in char to POSIX coercion @@ -14710,8 +14709,8 @@ test(2038.18, between(1:5, 2, 4), output="between parallel processing of integer test(2038.19, between(1:5, 2L, 4), output="between parallel processing of integer") test(2038.20, between(1:5, 2, 4L), output="between parallel processing of integer") # revdep regression #3565 -test(2038.21, between(1:10, -Inf, Inf), rep(TRUE,10), output="between parallel processing of double using closed bounds") -test(2038.22, between(as.double(1:10), -Inf, Inf), rep(TRUE,10), output="between parallel processing of double using closed bounds") +test(2038.21, between(1:10, -Inf, Inf), rep(TRUE,10), output="between parallel processing of double with closed bounds") +test(2038.22, between(as.double(1:10), -Inf, Inf), rep(TRUE,10), output="between parallel processing of double with closed bounds") options(old) # between int64 support @@ -14723,26 +14722,26 @@ if (test_bit64) { x = as.i64(1:10) ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) ans36open = c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE) - test(2039.03, between(x, 3, 6), ans36, output="between parallel processing of integer64 with recycling") - test(2039.04, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.05, between(x, 3L, 6), ans36, output="between parallel processing of integer64 with recycling") - test(2039.06, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.07, between(x, 3, 6L), ans36, output="between parallel processing of integer64 with recycling") - test(2039.08, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.09, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 with recycling") - test(2039.10, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.03, between(x, 3, 6), ans36, output="between parallel processing of integer64 took") + test(2039.04, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.05, between(x, 3L, 6), ans36, output="between parallel processing of integer64 took") + test(2039.06, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.07, between(x, 3, 6L), ans36, output="between parallel processing of integer64 took") + test(2039.08, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.09, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 took") + test(2039.10, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") test(2039.11, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took") test(2039.12, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") maxint = 2147483647 - test(2039.13, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") - test(2039.14, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2039.13, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") + test(2039.14, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") x[5] = NA ans36[5] = NA ans36open[5] = NA - test(2039.15, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") - test(2039.16, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2039.17, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 with recycling") - test(2039.18, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 with recycling") + test(2039.15, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") + test(2039.16, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.17, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") + test(2039.18, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") test(2039.19, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") test(2039.20, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") options(old) @@ -15679,9 +15678,9 @@ test(2078.04, between(letters, NA_character_, "m", incbounds=TRUE), c(rep(TRUE, test(2078.05, between(letters, NA_character_, "m", incbounds=FALSE), c(rep(TRUE, 12L), rep(FALSE, 14L))) test(2078.06, between(letters, NA_character_, NA_character_, incbounds=TRUE), rep(TRUE, 26L)) test(2078.07, between(letters, NA_character_, NA_character_, incbounds=FALSE), rep(TRUE, 26L)) -test(2078.11, between(c("a","b","c"), c("a","b"), "d"), error="bounds recycling works only for scalar") -test(2078.12, between(c("a","b","c"), "a", c("b","d")), error="bounds recycling works only for scalar") -test(2078.13, between(c("a","b","c"), c("a","b"), c("b","d")), error="bounds recycling works only for scalar") +test(2078.11, between(c("a","b","c"), c("a","b"), "d"), error="Incompatible vector lengths.*3.*2.*1") +test(2078.12, between(c("a","b","c"), "a", c("b","d")), error="Incompatible vector lengths.*3.*2.*1") +test(2078.13, between(c("a","b","c"), c("a","b"), c("b","d")), error="Incompatible vector lengths.*3.*2.*2") test(2078.21, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=TRUE), c(FALSE, TRUE, TRUE)) test(2078.22, between(c("a","c","e"), c("b",NA,"d"), "e", incbounds=FALSE), c(FALSE, TRUE, FALSE)) test(2078.23, between(c("a","c","e"), c("b","c","d"), NA, incbounds=TRUE), c(FALSE, TRUE, TRUE)) @@ -15693,9 +15692,9 @@ test(2078.28, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=FALSE), c(FA test(2078.29, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) test(2078.30, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=FALSE), c(TRUE, TRUE, FALSE)) # between NA bound as unknown rather than missing #3522 -test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=TRUE), c(FALSE, FALSE, NA, NA, NA)) -test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, NA, NA)) -test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=NA), error="NAbounds must be TRUE or FALSE") +test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=NA), c(FALSE, FALSE, NA, NA, NA)) +test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, TRUE, TRUE)) +test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=FALSE), error="NAbounds must be TRUE or NA") # nanotime support if (test_nanotime) { n=nanotime(1:4) @@ -15703,7 +15702,7 @@ if (test_nanotime) { op = options(datatable.verbose=TRUE) test(2080.01, between(n, nanotime(2), nanotime(10)), c(FALSE, NA, TRUE, TRUE), output="between parallel processing of integer64") test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") - test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=TRUE), c(FALSE, NA, FALSE, NA), output="optimised between not available for this data type or NAbounds argument") + test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") options(op) test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="are integer64 class while 'x' is not") test(2080.05, between(1:10, 3, nanotime(6)), error="are integer64 class while 'x' is not") diff --git a/src/between.c b/src/between.c index 5074f8deaf..3ce67a1988 100644 --- a/src/between.c +++ b/src/between.c @@ -1,7 +1,6 @@ #include "data.table.h" -SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds) { - +SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { int nprotect = 0; R_len_t nx = length(x), nl = length(lower), nu = length(upper); if (!nx || !nl || !nu) @@ -14,12 +13,12 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds) { } if (!isLogical(incbounds) || LOGICAL(incbounds)[0]==NA_LOGICAL) error("incbounds must be logical TRUE or FALSE"); - if (!isLogical(NAbounds) || LOGICAL(NAbounds)[0]==FALSE) + const bool open = !LOGICAL(incbounds)[0]; + if (!isLogical(NAboundsArg) || LOGICAL(NAboundsArg)[0]==FALSE) error("NAbounds must be logical TRUE or NA"); + const bool NAbounds = LOGICAL(NAboundsArg)[0]==TRUE; const bool verbose = GetVerbose(); - //bool integer=true; - //bool integer64=false; if (isInteger(x)) { if ((isInteger(lower) || isRealReallyInt(lower)) && (isInteger(upper) || isRealReallyInt(upper))) { // #3517 coerce to num to int when possible @@ -32,21 +31,16 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds) { } else { // #3565 x = PROTECT(coerceVector(x, REALSXP)); nprotect++; } - } else if (isReal(x)) { - if (INHERITS(x,char_integer64) || INHERITS(x,char_nanotime)) { - if (INHERITS(x,char_integer64) && (!INHERITS(lower,char_integer64) || !INHERITS(upper,char_integer64))) - error("Internal error in between: 'x' is integer64 while 'lower' and/or 'upper' are not, should have been caught by now"); // # nocov - if (INHERITS(x,char_nanotime) && (!INHERITS(lower,char_nanotime) || !INHERITS(upper,char_nanotime))) - error("Internal error in between: 'x' is nanotime while 'lower' and/or 'upper' are not, should have been caught by now"); // # nocov - } - if (!isReal(lower)) { - lower = PROTECT(coerceVector(lower, REALSXP)); nprotect++; // these coerces will convert NA appropriately - } - if (!isReal(upper)) { - upper = PROTECT(coerceVector(upper, REALSXP)); nprotect++; - } } - // TODO: sweep through lower and upper ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX + if (TYPEOF(lower) != TYPEOF(x)) { + lower = PROTECT(coerceVector(lower, TYPEOF(x))); nprotect++; + } + if (TYPEOF(upper) != TYPEOF(x)) { + upper = PROTECT(coerceVector(upper, TYPEOF(x))); nprotect++; + } + + // *** TODO ***: sweep through lower and upper (inside cases below due to integer64) + // ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX const bool recycleX = nx==1; const bool recycleLow = nl==1; @@ -54,78 +48,120 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds) { const int xMask = recycleX ? 0 : INT_MAX; const int lowMask = recycleLow ? 0 : INT_MAX; const int uppMask = recycleUpp ? 0 : INT_MAX; - const bool open = !LOGICAL(incbounds)[0]; - const bool NAboundsNA = LOGICAL(NAbounds)[0]==NA_INTEGER; SEXP ans = PROTECT(allocVector(LGLSXP, longest)); nprotect++; int *restrict ansp = LOGICAL(ans); double tic=omp_get_wtime(); + switch (TYPEOF(x)) { case INTSXP: { const int *lp = INTEGER(lower); const int *up = INTEGER(upper); const int *xp = INTEGER(x); - #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; i= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) - // one relatively complex condition here (using one ?:, short-circuit '&&', and const bool NAboundsNA (predicted away)) should be - // as fast as specialized extra loops but with simplicity, correctness & coverage benefits. + if (NAbounds) { // default NAbounds==TRUE => NA bound means TRUE; i.e. asif lower=-Inf or upper==Inf) + #pragma omp parallel for num_threads(getDTthreads()) + for (int i=0; i= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) + } + } else { + #pragma omp parallel for num_threads(getDTthreads()) + for (int i=0; iu-open) || (lok && elemu-open) || (lok && elem=u) || (lok && elem<=l)) ? FALSE : NA_LOGICAL; + } } if (verbose) Rprintf("between parallel processing of double with open bounds took %8.3fs\n", omp_get_wtime()-tic); } else { - #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; iu) || (lok && elem Date: Thu, 22 Aug 2019 20:16:00 -0700 Subject: [PATCH 11/16] interim --- R/between.R | 2 -- src/between.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/between.R b/R/between.R index c24a9492fa..2e96eb1861 100644 --- a/R/between.R +++ b/R/between.R @@ -45,8 +45,6 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") # nocov if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) - } else if (is.i64(lower) || is.i64(upper)) { - stop("'lower' and/or 'upper' are integer64 class while 'x' is not. Please align classes before passing to 'between'.") } is.supported = function(x) is.numeric(x) || is.character(x) || is.px(x) if (is.supported(x) && is.supported(lower) && is.supported(upper)) { diff --git a/src/between.c b/src/between.c index 3ce67a1988..0c0816019f 100644 --- a/src/between.c +++ b/src/between.c @@ -79,7 +79,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { case REALSXP: if (Rinherits(x, char_integer64)) { if (!Rinherits(lower, char_integer64) || !Rinherits(upper, char_integer64)) - error("Internal error: x is integer64 but lower and/or upper are not. Should have been caught at R level."); // # nocov + error("x is integer64 but lower and/or upper are not."); // e.g. between(int64, character, character) const int64_t *lp = (int64_t *)REAL(lower); const int64_t *up = (int64_t *)REAL(upper); const int64_t *xp = (int64_t *)REAL(x); @@ -101,7 +101,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { if (verbose) Rprintf("between parallel processing of integer64 took %8.3fs\n", omp_get_wtime()-tic); } else { if (Rinherits(lower, char_integer64) || Rinherits(upper, char_integer64)) - error("Internal error: x is not integer64 but lower and/or upper is integer64. Should have been caught at R level."); // # nocov + error("x is not integer64 but lower and/or upper is integer64. Please align classes."); const double *lp = REAL(lower); const double *up = REAL(upper); const double *xp = REAL(x); From fb3b1599c3dd58a486d9700c2108930265d09033 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 23 Aug 2019 11:33:24 -0700 Subject: [PATCH 12/16] tidy --- R/between.R | 17 +---------------- inst/tests/tests.Rraw | 8 ++++---- man/between.Rd | 4 ++-- src/between.c | 2 +- 4 files changed, 8 insertions(+), 23 deletions(-) diff --git a/R/between.R b/R/between.R index 2e96eb1861..aa18ac3fac 100644 --- a/R/between.R +++ b/R/between.R @@ -14,21 +14,6 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { "'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", e$message)) if (is.character(upper)) upper = tryCatch(as.POSIXct(upper, tz=tz), error=function(e)stop( "'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", e$message)) - - # try_posix_cast = function(x, tz) {tryCatch( - # list(status=0L, value=as.POSIXct(x, tz = tz)), - # error = function(e) list(status=1L, value=NULL, message=e[["message"]]) - # )} - # if (is.character(lower)) { - # ans = try_posix_cast(lower, tz) - # if (ans$status==0L) lower = ans$value - # else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", ans$message) - # } - # if (is.character(upper)) { - # ans = try_posix_cast(upper, tz) - # if (ans$status==0L) upper = ans$value - # else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", ans$message) - # } stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal } # POSIX check timezone match @@ -48,7 +33,7 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { } is.supported = function(x) is.numeric(x) || is.character(x) || is.px(x) if (is.supported(x) && is.supported(lower) && is.supported(upper)) { - # faster parallelised version for int/double. + # faster parallelised version for int/double/character # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower .Call(Cbetween, x, lower, upper, incbounds, NAbounds) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0fdacd80a5..25bd8c16e8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14716,8 +14716,8 @@ options(old) # between int64 support if (test_bit64) { as.i64 = bit64::as.integer64 - test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' is not") - test(2039.02, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' is not") + test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="x is not integer64 but.*Please align classes") + test(2039.02, between(1:10, 3, as.i64(6)), error="x is not integer64 but.*Please align classes") old = options("datatable.verbose"=TRUE) x = as.i64(1:10) ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) @@ -15704,8 +15704,8 @@ if (test_nanotime) { test(2080.02, between(n, nanotime(3), nanotime(10), incbounds=FALSE), c(FALSE, NA, FALSE, TRUE), output="between parallel processing of integer64") test(2080.03, between(n, nanotime(3), nanotime(NA), incbounds=FALSE, NAbounds=NA), c(FALSE, NA, FALSE, NA), output="between parallel processing of integer64") options(op) - test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="are integer64 class while 'x' is not") - test(2080.05, between(1:10, 3, nanotime(6)), error="are integer64 class while 'x' is not") + test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") + test(2080.05, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") } diff --git a/man/between.Rd b/man/between.Rd index 28120d8797..b62686325e 100644 --- a/man/between.Rd +++ b/man/between.Rd @@ -16,7 +16,7 @@ This can be changed by setting \code{NAbounds} to \code{TRUE}. the intervals provided in \code{lower,upper}. } \usage{ -between(x, lower, upper, incbounds=TRUE, NAbounds=FALSE) +between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) x \%between\% y inrange(x, lower, upper, incbounds=TRUE) @@ -33,7 +33,7 @@ interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.} \item{incbounds}{ \code{TRUE} means inclusive bounds, i.e., [lower,upper]. \code{FALSE} means exclusive bounds, i.e., (lower,upper). It is set to \code{TRUE} by default for infix notations.} -\item{NAbounds}{ \code{TRUE} force to treat \code{NA} bound as \code{NA} rather than missing bound.} +\item{NAbounds}{ If \code{lower} contains an \code{NA} what should \code{lower<=x} return? By default \code{TRUE} so that a missing bound is interpreted as unlimited; e.g., the \code{NA} is treated as if it were \code{-Inf} in the case of \code{numeric}. Similarly for \code{upper}. } } \details{ diff --git a/src/between.c b/src/between.c index 0c0816019f..5554941533 100644 --- a/src/between.c +++ b/src/between.c @@ -149,7 +149,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { const SEXP *xp = STRING_PTR(x); #define LCMP strcmp(CHAR(ENC2UTF8(l)),CHAR(ENC2UTF8(elem)))<=-open #define UCMP strcmp(CHAR(ENC2UTF8(elem)),CHAR(ENC2UTF8(u)))<=-open - // ENC2UTF8 could allocate so cannot be parallel (TODO: if all ascii, can be parallel) TODO: needs protection + // TODO if all ascii can be parallel, otherwise ENC2UTF8 could allocate if (NAbounds) { for (int i=0; i Date: Fri, 23 Aug 2019 12:28:09 -0700 Subject: [PATCH 13/16] use word 'unlimited bound' in news item and man page --- NEWS.md | 2 +- man/between.Rd | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 156aedd2c2..dbef189c0f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -228,7 +228,7 @@ 27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report. -28. From v1.12.0, `between()` and `%between%` interpreted missing values in `lower=` or `upper=` as missing bounds and return `TRUE` rather than unknown bound returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522) (thanks @cguill95). New argument `NAbound` has been added to achieve the old behaviour; see `?between`. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported. +28. From v1.12.0 `between()` and `%between%` interpret missing values in `lower=` or `upper=` as unlimited bounds. A new parameter `NAbounds` has been added to achieve the old behaviour of returning `NA`, [#3522](https://github.com/Rdatatable/data.table/issues/3522). Thanks @cguill95 for reporting. This is now consistent for character input, [#3667](https://github.com/Rdatatable/data.table/issues/3667) (thanks @AnonymousBoba), and class `nanotime` is now supported too. #### NOTES diff --git a/man/between.Rd b/man/between.Rd index b62686325e..97d96fe377 100644 --- a/man/between.Rd +++ b/man/between.Rd @@ -7,10 +7,10 @@ \description{ Intended for use in \code{i} in \code{[.data.table}. -\code{between} is equivalent to \code{x >= lower & x <= upper} when -\code{incbounds=TRUE}, or \code{x > lower & y < upper} when \code{FALSE}. With a caveat that -\code{NA} in \code{lower} or \code{upper} are taken as a missing bound and return \code{TRUE} not \code{NA}. -This can be changed by setting \code{NAbounds} to \code{TRUE}. +\code{between} is equivalent to \code{lower<=x & x<=upper} when +\code{incbounds=TRUE}, or \code{lower Date: Fri, 23 Aug 2019 13:45:13 -0700 Subject: [PATCH 14/16] coverage --- R/between.R | 3 +-- inst/tests/tests.Rraw | 59 ++++++++++++++++++++++++------------------- src/between.c | 4 +-- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/R/between.R b/R/between.R index aa18ac3fac..b9588495cb 100644 --- a/R/between.R +++ b/R/between.R @@ -1,6 +1,5 @@ # is x[i] in between lower[i] and upper[i] ? between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { - if (!isTRUEorNA(NAbounds)) stop("NAbounds must be TRUE or NA") if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) @@ -39,7 +38,7 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE) { .Call(Cbetween, x, lower, upper, incbounds, NAbounds) } else { if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") - if (isTRUE(NAbounds)) stop("Not yet implemented NAbounds for this non-numeric and non-character type") + if (isTRUE(NAbounds) && (anyNA(lower) || anyNA(upper))) stop("Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") if (incbounds) x>=lower & x<=upper else x>lower & x= 1L & x <= 5L) test(1529.05, x %between% c(1L, 5L), x >= 1L & x <= 5L) test(1529.06, between(x, 1L, 5L, FALSE), x > 1L & x < 5L) x = runif(15) -test(1529.07, between(x, 0.25, 0.75, TRUE), x >= 0.25 & x <= 0.75) +test(1529.07, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75) test(1529.08, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) -test(1529.09, between(x, 0.25, 0.75, FALSE), x > 0.25 & x < 0.75) +test(1529.09, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75) x = c(NA, runif(15), NA) -test(1529.10, between(x, 0.25, 0.75, TRUE), x >= 0.25 & x <= 0.75) +test(1529.10, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75) test(1529.11, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) -test(1529.12, between(x, 0.25, 0.75, FALSE), x > 0.25 & x < 0.75) +test(1529.12, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75) # add tests for which.first and which.last # which.first @@ -14657,8 +14657,8 @@ test(2037.2, names(DT), 'a') test(2037.3, unname(table(unlist(strsplit(capture.output(foo(DT)), '\n|\\s+')))['ptr']), 1L) # `between` invalid args, and verbose #3516 -test(2038.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be logical") -test(2038.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be logical") +test(2038.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be TRUE or FALSE") +test(2038.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be TRUE or FALSE") old = options(datatable.verbose=TRUE) test(2038.03, between(1:5, 2L, 4L), output="between parallel processing of integer took") test(2038.04, between(1:5, rep(2L,5L), rep(4L, 5L)), output="between parallel processing of integer took") @@ -14716,34 +14716,35 @@ options(old) # between int64 support if (test_bit64) { as.i64 = bit64::as.integer64 - test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="x is not integer64 but.*Please align classes") - test(2039.02, between(1:10, 3, as.i64(6)), error="x is not integer64 but.*Please align classes") + test(2039.01, between(1:10, as.i64(3), as.i64(6)), error="x is not integer64 but.*Please align classes") + test(2039.02, between(1:10, 3, as.i64(6)), error="x is not integer64 but.*Please align classes") + test(2039.03, between(as.i64(1:3), "2", as.i64(4)), error="x is integer64 but lower and/or upper are not") old = options("datatable.verbose"=TRUE) x = as.i64(1:10) ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) ans36open = c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE) - test(2039.03, between(x, 3, 6), ans36, output="between parallel processing of integer64 took") - test(2039.04, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") - test(2039.05, between(x, 3L, 6), ans36, output="between parallel processing of integer64 took") - test(2039.06, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") - test(2039.07, between(x, 3, 6L), ans36, output="between parallel processing of integer64 took") - test(2039.08, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") - test(2039.09, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 took") - test(2039.10, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") - test(2039.11, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took") - test(2039.12, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.04, between(x, 3, 6), ans36, output="between parallel processing of integer64 took") + test(2039.05, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.06, between(x, 3L, 6), ans36, output="between parallel processing of integer64 took") + test(2039.07, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.08, between(x, 3, 6L), ans36, output="between parallel processing of integer64 took") + test(2039.09, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.10, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 took") + test(2039.11, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.12, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took") + test(2039.13, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") maxint = 2147483647 - test(2039.13, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") - test(2039.14, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.14, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") + test(2039.15, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") x[5] = NA ans36[5] = NA ans36open[5] = NA - test(2039.15, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") - test(2039.16, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") - test(2039.17, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") - test(2039.18, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") - test(2039.19, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") - test(2039.20, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") + test(2039.16, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 took") + test(2039.17, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + test(2039.18, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") + test(2039.19, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") + test(2039.20, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") + test(2039.21, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") options(old) } @@ -15707,6 +15708,12 @@ if (test_nanotime) { test(2080.04, between(1:10, nanotime(3), nanotime(6)), error="x is not integer64 but.*Please align classes") test(2080.05, between(1:10, 3, nanotime(6)), error="x is not integer64 but.*Please align classes") } +# use raw type to cover fallback to R in between.R +old = options(datatable.verbose=TRUE) +test(2081.01, between(as.raw(1:5), as.raw(2), as.raw(4)), c(FALSE, TRUE, TRUE, TRUE, FALSE), output="fallback to slow R") +test(2081.02, between(as.raw(1:5), as.raw(2), as.raw(4), incbounds=FALSE), c(FALSE, FALSE, TRUE, FALSE, FALSE), output="fallback to slow R") +options(old) +test(2081.03, between(3i, NA, NA), error="Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") ################################### diff --git a/src/between.c b/src/between.c index 5554941533..6b9d414088 100644 --- a/src/between.c +++ b/src/between.c @@ -12,10 +12,10 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { error("Incompatible vector lengths: length(x)==%d length(lower)==%d length(upper)==%d. Each should be either length 1 or the length of the longest.", nx, nl, nu); } if (!isLogical(incbounds) || LOGICAL(incbounds)[0]==NA_LOGICAL) - error("incbounds must be logical TRUE or FALSE"); + error("incbounds must be TRUE or FALSE"); const bool open = !LOGICAL(incbounds)[0]; if (!isLogical(NAboundsArg) || LOGICAL(NAboundsArg)[0]==FALSE) - error("NAbounds must be logical TRUE or NA"); + error("NAbounds must be TRUE or NA"); const bool NAbounds = LOGICAL(NAboundsArg)[0]==TRUE; const bool verbose = GetVerbose(); From 1ad6c774892300974883390ce6e864b7fcf448b7 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 23 Aug 2019 14:24:10 -0700 Subject: [PATCH 15/16] coverage --- inst/tests/tests.Rraw | 22 ++++++++++++++++------ src/between.c | 6 +++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 17e2c90ab3..bb3617e861 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7162,10 +7162,18 @@ x = runif(15) test(1529.07, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75) test(1529.08, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) test(1529.09, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75) +test(1529.10, between(x, 0.25, NA, NAbounds=NA), ifelse(x<=0.25, FALSE, NA)) +test(1529.11, between(x, NA, 0.75, NAbounds=NA), ifelse(x>=0.75, FALSE, NA)) +test(1529.12, between(x, NA, NA, NAbounds=NA), rep(NA, length(x))) +test(1529.13, between(x, NA, NA, NAbounds=TRUE), rep(TRUE, length(x))) +test(1529.14, between(x, x[3], NA, incbounds=FALSE, NAbounds=NA), ifelse(x<=x[3], FALSE, NA)) +test(1529.15, between(x, x[3], NA, incbounds=TRUE, NAbounds=NA), ifelse(x=x[9], FALSE, NA)) +test(1529.17, between(x, NA, x[9], incbounds=TRUE, NAbounds=NA), ifelse(x>x[9], FALSE, NA)) x = c(NA, runif(15), NA) -test(1529.10, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75) -test(1529.11, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) -test(1529.12, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75) +test(1529.18, between(x, 0.25, 0.75, incbounds=TRUE), x >= 0.25 & x <= 0.75) +test(1529.19, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) +test(1529.20, between(x, 0.25, 0.75, incbounds=FALSE), x > 0.25 & x < 0.75) # add tests for which.first and which.last # which.first @@ -15690,9 +15698,11 @@ test(2078.25, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=TRUE), c(FALS test(2078.26, between(c("a","c","e"), c("b",NA,"d"), NA, incbounds=FALSE), c(FALSE, TRUE, TRUE)) test(2078.27, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) test(2078.28, between(c("a","c","e"), "a", c("b",NA,"d"), incbounds=FALSE), c(FALSE, TRUE, FALSE)) -test(2078.29, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=TRUE), c(TRUE, TRUE, FALSE)) -test(2078.30, between(c("a","c","e"), NA, c("b",NA,"d"), incbounds=FALSE), c(TRUE, TRUE, FALSE)) -# between NA bound as unknown rather than missing #3522 +test(2078.29, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=TRUE), c(TRUE, TRUE, TRUE)) +test(2078.30, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE), c(TRUE, TRUE, FALSE)) +test(2078.31, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=TRUE, NAbounds=NA), c(NA, NA, NA)) +test(2078.32, between(c("a","c","e"), NA, c("b",NA,"e"), incbounds=FALSE, NAbounds=NA), c(NA, NA, FALSE)) +# between NA bound as unknown rather than unlimited, #3522 test(2079.01, between(1:5, 3L, NA, incbounds=TRUE, NAbounds=NA), c(FALSE, FALSE, NA, NA, NA)) test(2079.02, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=TRUE), c(FALSE, FALSE, FALSE, TRUE, TRUE)) test(2079.03, between(1:5, 3L, NA, incbounds=FALSE, NAbounds=FALSE), error="NAbounds must be TRUE or NA") diff --git a/src/between.c b/src/between.c index 6b9d414088..6d620dbd2f 100644 --- a/src/between.c +++ b/src/between.c @@ -147,8 +147,8 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { const SEXP *lp = STRING_PTR(lower); const SEXP *up = STRING_PTR(upper); const SEXP *xp = STRING_PTR(x); - #define LCMP strcmp(CHAR(ENC2UTF8(l)),CHAR(ENC2UTF8(elem)))<=-open - #define UCMP strcmp(CHAR(ENC2UTF8(elem)),CHAR(ENC2UTF8(u)))<=-open + #define LCMP (strcmp(CHAR(ENC2UTF8(l)),CHAR(ENC2UTF8(elem)))<=-open) + #define UCMP (strcmp(CHAR(ENC2UTF8(elem)),CHAR(ENC2UTF8(u)))<=-open) // TODO if all ascii can be parallel, otherwise ENC2UTF8 could allocate if (NAbounds) { for (int i=0; i Date: Fri, 23 Aug 2019 15:37:45 -0700 Subject: [PATCH 16/16] check lower<=upper --- inst/tests/tests.Rraw | 7 ++++++- man/between.Rd | 3 ++- src/between.c | 44 ++++++++++++++++++++++++++++++------------- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bb3617e861..964e9181aa 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9449,7 +9449,7 @@ test(1656, nrow(dt[dt, .(x.id, x.a, x.b, x.c, x.row_id, i.id, i.a, i.b, i.c, i.r # between is vectorised, #534 set.seed(1L) -dt = data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE), z=sample(5,10,TRUE)) +dt = data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE), z=1L+sample(2,10,TRUE)) test(1657, dt[x %between% list(y,z)], dt[x>=y & x<=z]) # fwrite tests @@ -15724,6 +15724,11 @@ test(2081.01, between(as.raw(1:5), as.raw(2), as.raw(4)), c(FAL test(2081.02, between(as.raw(1:5), as.raw(2), as.raw(4), incbounds=FALSE), c(FALSE, FALSE, TRUE, FALSE, FALSE), output="fallback to slow R") options(old) test(2081.03, between(3i, NA, NA), error="Not yet implemented NAbounds=TRUE for this non-numeric and non-character type") +# check that lower<=upper +test(2082.01, between(1:3, INT(2,3,4), 2L), error="Item 2 of lower (3) is greater than item 1 of upper (2)") +test(2082.02, between(as.double(1:3), c(2,3,4), 2), error="Item 2 of lower [(]3.*[)] is greater than item 1 of upper [(]2.*[)]") +if (test_bit64) test(2082.03, between(as.integer64(1:3), as.integer64(2), as.integer64(1)), error="Item 1 of lower (2) is greater than item 1 of upper (1)") +test(2082.04, between(letters[1:2], c("foo","bar"), c("bar")), error="Item 1 of lower ('foo') is greater than item 1 of upper ('bar')") ################################### diff --git a/man/between.Rd b/man/between.Rd index 97d96fe377..f397b34d4d 100644 --- a/man/between.Rd +++ b/man/between.Rd @@ -10,7 +10,8 @@ Intended for use in \code{i} in \code{[.data.table}. \code{between} is equivalent to \code{lower<=x & x<=upper} when \code{incbounds=TRUE}, or \code{lowerupper}. \code{inrange} checks whether each value in \code{x} is in between any of the intervals provided in \code{lower,upper}. diff --git a/src/between.c b/src/between.c index 6d620dbd2f..344a9c75af 100644 --- a/src/between.c +++ b/src/between.c @@ -11,6 +11,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { (nx!=1 && nx!=longest)) { error("Incompatible vector lengths: length(x)==%d length(lower)==%d length(upper)==%d. Each should be either length 1 or the length of the longest.", nx, nl, nu); } + const int longestBound = MAX(nl, nu); if (!isLogical(incbounds) || LOGICAL(incbounds)[0]==NA_LOGICAL) error("incbounds must be TRUE or FALSE"); const bool open = !LOGICAL(incbounds)[0]; @@ -39,9 +40,6 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { upper = PROTECT(coerceVector(upper, TYPEOF(x))); nprotect++; } - // *** TODO ***: sweep through lower and upper (inside cases below due to integer64) - // ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX - const bool recycleX = nx==1; const bool recycleLow = nl==1; const bool recycleUpp = nu==1; @@ -57,16 +55,21 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg) { const int *lp = INTEGER(lower); const int *up = INTEGER(upper); const int *xp = INTEGER(x); + for (int i=0; iu) + error("Item %d of lower (%d) is greater than item %d of upper (%d)", (i&lowMask)+1, l, (i&uppMask)+1, u); + } if (NAbounds) { // default NAbounds==TRUE => NA bound means TRUE; i.e. asif lower=-Inf or upper==Inf) #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; i= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) } } else { #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; iu) + error("Item %d of lower (%lld) is greater than item %d of upper (%lld)", (i&lowMask)+1, l, (i&uppMask)+1, u); + } if (NAbounds) { #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; iu) + error("Item %d of lower (%f) is greater than item %d of upper (%f)", (i&lowMask)+1, l, (i&uppMask)+1, u); + } if (open) { if (NAbounds) { #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; i0) + error("Item %d of lower ('%s') is greater than item %d of upper ('%s')", (i&lowMask)+1, CHAR(l), (i&uppMask)+1, CHAR(u)); + } if (NAbounds) { - for (int i=0; i