From 8047463895e9bbe420e6fa4e02841bb7826eb435 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 22 Apr 2019 19:32:02 +0200 Subject: [PATCH 01/22] between incbounds, rm nocov, and test --- inst/tests/tests.Rraw | 4 ++++ src/between.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3e408f87d7..2aa7ac13a9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14054,6 +14054,10 @@ test(2027.1, DT[, list(1, ), by=a], error = 'Item 2 of the .() or list() passed test(2027.2, DT[, list(1,2,)], error = 'Item 3 of the .() or list() passed to j is missing') test(2027.3, DT[, .(1,,3,), by=a], error = 'Item 2 of the .() or list() passed to j is missing') +# `between` invalid arg +test(2028.1, between(1:5, 2,4, incbounds=423), error="incbounds must be logical") +test(2028.2, between(1:5, 2,4, incbounds=NA), error="incbounds must be logical") + ################################### # Add new tests above this line # diff --git a/src/between.c b/src/between.c index c5b97aecc4..67712bdf5a 100644 --- a/src/between.c +++ b/src/between.c @@ -12,7 +12,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { error("Incompatible vector lengths: length(x)==%d length(lower)==%d length(upper)==%d. Each should be either length 1 or the length of the longest.", nx, nl, nu); } if (!isLogical(bounds) || LOGICAL(bounds)[0] == NA_LOGICAL) - error("incbounds must be logical TRUE/FALSE."); // # nocov + error("incbounds must be logical TRUE/FALSE."); int nprotect = 0; bool integer=true; From a153f9d0b884678e64d6d4980e9d6fe33c5b1322 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 22 Apr 2019 20:16:10 +0200 Subject: [PATCH 02/22] add verbose to between function, closes #3516 --- R/between.R | 7 ++++--- inst/tests/tests.Rraw | 14 +++++++++++--- man/between.Rd | 3 ++- src/between.c | 15 ++++++++++++++- src/data.table.h | 3 +++ 5 files changed, 34 insertions(+), 8 deletions(-) diff --git a/R/between.R b/R/between.R index 759ab6dff1..cba84c17f9 100644 --- a/R/between.R +++ b/R/between.R @@ -1,5 +1,5 @@ # is x[i] in between lower[i] and upper[i] ? -between <- function(x,lower,upper,incbounds=TRUE) { +between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.verbose")) { if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) @@ -8,10 +8,11 @@ between <- function(x,lower,upper,incbounds=TRUE) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower - .Call(Cbetween, x, lower, upper, incbounds) + .Call(Cbetween, x, lower, upper, incbounds, verbose) } else { + if (isTRUE(verbose)) cat("between on character field, fallback to slow R routine\n") # now just for character input. TODO: support character between in Cbetween and remove this branch - if(incbounds) x>=lower & x<=upper + if (incbounds) x>=lower & x<=upper else x>lower & x= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) const int u = up[0]==NA_INTEGER ? INT_MAX : up[0] - open; + if (LOGICAL(verbose)[0]) tic = omp_get_wtime(); #pragma omp parallel for num_threads(getDTthreads()) for (int i=0; i Date: Mon, 22 Apr 2019 20:41:30 +0200 Subject: [PATCH 03/22] between support for POSIXct, closes #3519 --- NEWS.md | 2 ++ R/between.R | 7 ++++++- inst/tests/tests.Rraw | 8 +++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index bb7d6aba93..311517fb9a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -46,6 +46,8 @@ 3. A missing item in `j` such as `j=.(colA, )` now gives a helpful error (`Item 2 of the .() or list() passed to j is missing`) rather than the unhelpful error `argument "this_jsub" is missing, with no default` (v1.12.2) or `argument 2 is empty` (v1.12.0 and before), [#3507](https://github.com/Rdatatable/data.table/issues/3507). Thanks to @eddelbuettel for the report. +4. `between` now properly handles `POSIXct` type, [#3519](https://github.com/Rdatatable/data.table/issues/3519). + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/between.R b/R/between.R index cba84c17f9..a51fe1015f 100644 --- a/R/between.R +++ b/R/between.R @@ -4,13 +4,18 @@ between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.ve if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) is_strictly_numeric <- function(x) is.numeric(x) && !inherits(x, "integer64") + if (inherits(x, "POSIXct") && inherits(lower, "POSIXct") && inherits(upper, "POSIXct")) { #3519 + x = unclass(x) + lower = unclass(lower) + upper = unclass(upper) + } if (is_strictly_numeric(x) && is_strictly_numeric(lower) && is_strictly_numeric(upper)) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower .Call(Cbetween, x, lower, upper, incbounds, verbose) } else { - if (isTRUE(verbose)) cat("between on character field, fallback to slow R routine\n") + if (isTRUE(verbose)) cat("optimised between not available for this data type, fallback to slow R routine\n") # now just for character input. TODO: support character between in Cbetween and remove this branch if (incbounds) x>=lower & x<=upper else x>lower & x Date: Mon, 22 Apr 2019 21:02:19 +0200 Subject: [PATCH 04/22] between support dot-list alias in RHS, closes #2315 --- NEWS.md | 2 +- R/between.R | 6 +++++- inst/tests/tests.Rraw | 3 +++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 311517fb9a..4d24690014 100644 --- a/NEWS.md +++ b/NEWS.md @@ -46,7 +46,7 @@ 3. A missing item in `j` such as `j=.(colA, )` now gives a helpful error (`Item 2 of the .() or list() passed to j is missing`) rather than the unhelpful error `argument "this_jsub" is missing, with no default` (v1.12.2) or `argument 2 is empty` (v1.12.0 and before), [#3507](https://github.com/Rdatatable/data.table/issues/3507). Thanks to @eddelbuettel for the report. -4. `between` now properly handles `POSIXct` type, [#3519](https://github.com/Rdatatable/data.table/issues/3519). +4. `between` now properly handles `POSIXct` type, [#3519](https://github.com/Rdatatable/data.table/issues/3519). Additionally support `.` as `list` alias in RHS, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the report. #### NOTES diff --git a/R/between.R b/R/between.R index a51fe1015f..6c12cb1584 100644 --- a/R/between.R +++ b/R/between.R @@ -24,8 +24,12 @@ between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.ve # %between% is vectorised, #534. "%between%" <- function(x, y) { + ysub = substitute(y) + if (ysub[[1L]]==".") { + ysub[[1L]]=quote(list) + y = eval.parent(ysub) + } if ((l <- length(y)) != 2L) { - ysub = substitute(y) stop("RHS has length() ", l, "; expecting length 2. ", if (is.call(ysub) && ysub[[1L]] == 'c') sprintf("Perhaps you meant %s? ", diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2bff1137e6..5dbdb6e1fb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14071,6 +14071,9 @@ dn = as.POSIXct('2016-09-18 08:00:00') up = as.POSIXct('2016-09-18 09:00:00') test(2028.11, between(x, dn, up, verbose=TRUE), output="between parallel processing of double using closed bounds with recycling took") test(2028.12, between(x, dn, up, incbounds=FALSE, verbose=TRUE), output="between parallel processing of double using open bounds with recycling took") +# `between` support `.` in RHS #2315 +X = data.table(a = 1:5, b = 6:10, c = c(5:1)) +test(2028.13, X[c %between% list(a, b)], X[c %between% .(a, b)]) ################################### From ccf0b42a074744a26277c40f9803fdb4c4dca6f5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 22 Apr 2019 21:39:48 +0200 Subject: [PATCH 05/22] fix missing check for is.call --- R/between.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/between.R b/R/between.R index 6c12cb1584..15641d6afb 100644 --- a/R/between.R +++ b/R/between.R @@ -25,7 +25,7 @@ between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.ve # %between% is vectorised, #534. "%between%" <- function(x, y) { ysub = substitute(y) - if (ysub[[1L]]==".") { + if (is.call(ysub) && ysub[[1L]]==".") { ysub[[1L]]=quote(list) y = eval.parent(ysub) } From 47b52e341f399e413a4d49fb93a330b2402886b3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 23 Apr 2019 18:37:48 +0800 Subject: [PATCH 06/22] new functionality for parallel processing to apply to POSIX-as-string RHS --- R/between.R | 21 +++++++++++++++++---- inst/tests/tests.Rraw | 13 ++++++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/R/between.R b/R/between.R index 15641d6afb..347b9fbd51 100644 --- a/R/between.R +++ b/R/between.R @@ -4,10 +4,23 @@ between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.ve if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) is_strictly_numeric <- function(x) is.numeric(x) && !inherits(x, "integer64") - if (inherits(x, "POSIXct") && inherits(lower, "POSIXct") && inherits(upper, "POSIXct")) { #3519 - x = unclass(x) - lower = unclass(lower) - upper = unclass(upper) + try_posix_cast <- function(x, tz) { + x_time = as.POSIXct(x, tz = tz) + if (is.na(x_time)) return(x) + return(x_time) + } + if (inherits(x, "POSIXct")) { + # allow fast between on POSIX vs POSIX-as-string in some circumstances + # (biggest worry is hard-to-predict timezone mismatch) + if (!is.null(tz <- attr(x, 'tzone')) && (is.character(lower) || is.character(upper))) { + lower = try_posix_cast(lower, tz) + upper = try_posix_cast(upper, tz) + } + if (inherits(lower, "POSIXct") && inherits(upper, "POSIXct")) { #3519 + x = unclass(x) + lower = unclass(lower) + upper = unclass(upper) + } } if (is_strictly_numeric(x) && is_strictly_numeric(lower) && is_strictly_numeric(upper)) { # faster parallelised version for int/double. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5dbdb6e1fb..66ae919d98 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14071,9 +14071,20 @@ dn = as.POSIXct('2016-09-18 08:00:00') up = as.POSIXct('2016-09-18 09:00:00') test(2028.11, between(x, dn, up, verbose=TRUE), output="between parallel processing of double using closed bounds with recycling took") test(2028.12, between(x, dn, up, incbounds=FALSE, verbose=TRUE), output="between parallel processing of double using open bounds with recycling took") + +## also handling of string lower/upper bounds _only when x has a time zone_ +dn = as.character(dn) +up = as.character(up) +test(2028.13, between(x, dn, up, verbose = TRUE), output = 'optimised between not available') +attr(x, 'tzone') = 'UTC' +test(2028.14, between(x, dn, up, verbose = TRUE), output = 'between parallel processing of double') +### additional flexibility -- cast when one bound is already POSIXct +up = as.POSIXct(up) +test(2028.15, between(x, dn, up, verbose = TRUE), output = 'between parallel processing of double') + # `between` support `.` in RHS #2315 X = data.table(a = 1:5, b = 6:10, c = c(5:1)) -test(2028.13, X[c %between% list(a, b)], X[c %between% .(a, b)]) +test(2028.16, X[c %between% list(a, b)], X[c %between% .(a, b)]) ################################### From 8fd75718a567e9ce48de72a1d532801278d84e47 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 24 Apr 2019 10:43:17 +0800 Subject: [PATCH 07/22] remove LHS=character restriction --- R/between.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/between.R b/R/between.R index 347b9fbd51..eac55167a8 100644 --- a/R/between.R +++ b/R/between.R @@ -10,9 +10,9 @@ between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.ve return(x_time) } if (inherits(x, "POSIXct")) { - # allow fast between on POSIX vs POSIX-as-string in some circumstances + # allow fast between on POSIX vs non-explicit-POSIX in some circumstances # (biggest worry is hard-to-predict timezone mismatch) - if (!is.null(tz <- attr(x, 'tzone')) && (is.character(lower) || is.character(upper))) { + if (!is.null(tz <- attr(x, 'tzone'))) { lower = try_posix_cast(lower, tz) upper = try_posix_cast(upper, tz) } From ed0cc29a8fa21ced9ac5467074264721f904a0da Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 26 Apr 2019 16:26:24 -0700 Subject: [PATCH 08/22] GetVerbose() added at C level and used instead of needing to pass through; consistent with inrange --- R/between.R | 8 ++++---- inst/tests/tests.Rraw | 26 +++++++++++++------------- man/between.Rd | 3 +-- src/between.c | 25 ++++++++++++------------- src/data.table.h | 4 +++- src/init.c | 7 +++++++ 6 files changed, 40 insertions(+), 33 deletions(-) diff --git a/R/between.R b/R/between.R index eac55167a8..9b110dbba6 100644 --- a/R/between.R +++ b/R/between.R @@ -1,5 +1,5 @@ # is x[i] in between lower[i] and upper[i] ? -between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.verbose")) { +between <- function(x,lower,upper,incbounds=TRUE) { if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) @@ -26,9 +26,9 @@ between <- function(x,lower,upper,incbounds=TRUE,verbose=getOption("datatable.ve # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower - .Call(Cbetween, x, lower, upper, incbounds, verbose) + .Call(Cbetween, x, lower, upper, incbounds) } else { - if (isTRUE(verbose)) cat("optimised between not available for this data type, fallback to slow R routine\n") + if (isTRUE(getOption("datatable.verbose"))) cat("optimised between not available for this data type, fallback to slow R routine\n") # now just for character input. TODO: support character between in Cbetween and remove this branch if (incbounds) x>=lower & x<=upper else x>lower & x=,<= and >,< - verbose = getOption("datatable.verbose") + verbose = isTRUE(getOption("datatable.verbose")) if (verbose) {last.started.at=proc.time();cat("forderv(query) took ... ");flush.console()} xo = forderv(query) if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 847b353a5d..38b575f0e9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14132,32 +14132,32 @@ test(2030.18, .Last.updated, 0L) # zero match # `between` invalid args, and verbose #3516 test(2031.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be logical") test(2031.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be logical") -test(2031.03, between(1:5, 2, 4, verbose=423), error="verbose must be logical") -test(2031.04, between(1:5, 2, 4, verbose=NA), error="verbose must be logical") -test(2031.05, between(1:5, 2L, 4L, verbose=TRUE), output="between parallel processing of integer with recycling took") -test(2031.06, between(1:5, rep(2L,5L), rep(4L, 5L), verbose=TRUE), output="between parallel processing of integer took") -test(2031.07, between(as.double(1:5), 2, 4, incbounds=FALSE, verbose=TRUE), output="between parallel processing of double using open bounds with recycling took") -test(2031.08, between(as.double(1:5), 2, 4, verbose=TRUE), output="between parallel processing of double using closed bounds with recycling took") -test(2031.09, between(as.double(1:5), rep(2, 5L), rep(4, 5L), verbose=TRUE), output="between parallel processing of double took") -test(2031.10, between(c("foo","bar","paz"), "bag", "fog", verbose=TRUE), output="optimised between not available for this data type, fallback to slow R routine") +old = options(datatable.verbose=TRUE) +test(2031.05, between(1:5, 2L, 4L), output="between parallel processing of integer with recycling took") +test(2031.06, between(1:5, rep(2L,5L), rep(4L, 5L)), output="between parallel processing of integer took") +test(2031.07, between(as.double(1:5), 2, 4, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") +test(2031.08, between(as.double(1:5), 2, 4), output="between parallel processing of double using closed bounds with recycling took") +test(2031.09, between(as.double(1:5), rep(2, 5L), rep(4, 5L)), output="between parallel processing of double took") +test(2031.10, between(c("foo","bar","paz"), "bag", "fog"), output="optimised between not available for this data type, fallback to slow R routine") # `between` handle POSIXct type x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 dn = as.POSIXct('2016-09-18 08:00:00') up = as.POSIXct('2016-09-18 09:00:00') -test(2031.11, between(x, dn, up, verbose=TRUE), output="between parallel processing of double using closed bounds with recycling took") -test(2031.12, between(x, dn, up, incbounds=FALSE, verbose=TRUE), output="between parallel processing of double using open bounds with recycling took") +test(2031.11, between(x, dn, up), output="between parallel processing of double using closed bounds with recycling took") +test(2031.12, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") ## also handling of string lower/upper bounds _only when x has a time zone_ dn = as.character(dn) up = as.character(up) -test(2031.13, between(x, dn, up, verbose = TRUE), output = 'optimised between not available') +test(2031.13, between(x, dn, up), output = 'optimised between not available') attr(x, 'tzone') = 'UTC' -test(2031.14, between(x, dn, up, verbose = TRUE), output = 'between parallel processing of double') +test(2031.14, between(x, dn, up), output = 'between parallel processing of double') ### additional flexibility -- cast when one bound is already POSIXct up = as.POSIXct(up) -test(2031.15, between(x, dn, up, verbose = TRUE), output = 'between parallel processing of double') +test(2031.15, between(x, dn, up), output = 'between parallel processing of double') # `between` support `.` in RHS #2315 X = data.table(a = 1:5, b = 6:10, c = c(5:1)) test(2031.16, X[c %between% list(a, b)], X[c %between% .(a, b)]) +options(old) ################################### diff --git a/man/between.Rd b/man/between.Rd index cbf42b58d3..f740d0983a 100644 --- a/man/between.Rd +++ b/man/between.Rd @@ -15,7 +15,7 @@ Intended for use in \code{i} in \code{[.data.table}. the intervals provided in \code{lower,upper}. } \usage{ -between(x, lower, upper, incbounds=TRUE, verbose=getOption("datatable.verbose")) +between(x, lower, upper, incbounds=TRUE) x \%between\% y inrange(x, lower, upper, incbounds=TRUE) @@ -32,7 +32,6 @@ interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.} \item{incbounds}{ \code{TRUE} means inclusive bounds, i.e., [lower,upper]. \code{FALSE} means exclusive bounds, i.e., (lower,upper). It is set to \code{TRUE} by default for infix notations.} -\item{verbose}{ \code{TRUE} turns on status and information messages to the console.} } \details{ diff --git a/src/between.c b/src/between.c index 24021ed036..e8d1bf045c 100644 --- a/src/between.c +++ b/src/between.c @@ -1,6 +1,6 @@ #include "data.table.h" -SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds, SEXP verbose) { +SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { R_len_t nx = length(x), nl = length(lower), nu = length(upper); double tic=0.0; @@ -14,8 +14,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds, SEXP verbose) { } if (!isLogical(bounds) || LOGICAL(bounds)[0] == NA_LOGICAL) error("incbounds must be logical TRUE/FALSE."); - if (!isLogical(verbose) || LOGICAL(verbose)[0] == NA_LOGICAL) - error("verbose must be logical TRUE/FALSE."); + const bool verbose = GetVerbose(); int nprotect = 0; bool integer=true; @@ -44,19 +43,19 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds, SEXP verbose) { if (!recycleX && recycleLow && recycleUpp) { const int l = lp[0] + open; // +open so we can always use >= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too) const int u = up[0]==NA_INTEGER ? INT_MAX : up[0] - open; - if (LOGICAL(verbose)[0]) tic = omp_get_wtime(); + if (verbose) tic = omp_get_wtime(); #pragma omp parallel for num_threads(getDTthreads()) for (int i=0; i Date: Wed, 1 May 2019 15:54:31 +0200 Subject: [PATCH 09/22] between posix from char more robust, remove unclass --- R/between.R | 46 +++++++++++++++++++++++++++---------------- inst/tests/tests.Rraw | 11 ++++++++--- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/R/between.R b/R/between.R index 9b110dbba6..11e8af61d1 100644 --- a/R/between.R +++ b/R/between.R @@ -3,26 +3,38 @@ between <- function(x,lower,upper,incbounds=TRUE) { if (is.logical(x)) stop("between has been x of type logical") if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) - is_strictly_numeric <- function(x) is.numeric(x) && !inherits(x, "integer64") - try_posix_cast <- function(x, tz) { - x_time = as.POSIXct(x, tz = tz) - if (is.na(x_time)) return(x) - return(x_time) - } - if (inherits(x, "POSIXct")) { - # allow fast between on POSIX vs non-explicit-POSIX in some circumstances - # (biggest worry is hard-to-predict timezone mismatch) - if (!is.null(tz <- attr(x, 'tzone'))) { - lower = try_posix_cast(lower, tz) - upper = try_posix_cast(upper, tz) + is.px = function(x) inherits(x, "POSIXct") + # POSIX special handling to auto coerce character + if (is.px(x) && !is.null(tz<-attr(x, "tzone", TRUE)) && nzchar(tz) && + (is.character(lower) || is.character(upper))) { + try_posix_cast <- function(x, tz) tryCatch( + list(status=0L, value=as.POSIXct(x, tz = tz)), + error = function(e) list(status=1L, value=NULL, message=e[["message"]]) + ) + if (is.character(lower)) { + ans = try_posix_cast(lower, tz) + if (ans$status==0L) lower = ans$value + else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with:", ans$message) + } + if (is.character(upper)) { + ans = try_posix_cast(upper, tz) + if (ans$status==0L) upper = ans$value + else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with:", ans$message) } - if (inherits(lower, "POSIXct") && inherits(upper, "POSIXct")) { #3519 - x = unclass(x) - lower = unclass(lower) - upper = unclass(upper) + stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal + } + # POSIX check time zone match + if (is.px(x) && is.px(lower) && is.px(upper)) { + tz_match = function(x, y, z) ( # NULL match "", else all identical + ((is.null(x) || !nzchar(x)) && (is.null(y) || !nzchar(y)) && (is.null(z) || !nzchar(z))) + || (identical(x, y) && identical(x, z)) + ) + if (!tz_match(attr(x, "tzone", TRUE), attr(lower, "tzone", TRUE), attr(upper, "tzone", TRUE))) { + stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") } } - if (is_strictly_numeric(x) && is_strictly_numeric(lower) && is_strictly_numeric(upper)) { + is.supported = function(x) is.integer(x) || (storage.mode(x)=="double" && !inherits(x, "integer64")) + if (is.supported(x) && is.supported(lower) && is.supported(upper)) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). # length(upper) can be 1 or length(x) independently of lower diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 38b575f0e9..12ee2e62d0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14139,21 +14139,26 @@ test(2031.07, between(as.double(1:5), 2, 4, incbounds=FALSE), output="between pa test(2031.08, between(as.double(1:5), 2, 4), output="between parallel processing of double using closed bounds with recycling took") test(2031.09, between(as.double(1:5), rep(2, 5L), rep(4, 5L)), output="between parallel processing of double took") test(2031.10, between(c("foo","bar","paz"), "bag", "fog"), output="optimised between not available for this data type, fallback to slow R routine") + # `between` handle POSIXct type x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 dn = as.POSIXct('2016-09-18 08:00:00') up = as.POSIXct('2016-09-18 09:00:00') test(2031.11, between(x, dn, up), output="between parallel processing of double using closed bounds with recycling took") test(2031.12, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") + ## also handling of string lower/upper bounds _only when x has a time zone_ -dn = as.character(dn) -up = as.character(up) +x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 +dn = '2016-09-18 08:00:00' +up = '2016-09-18 09:00:00' test(2031.13, between(x, dn, up), output = 'optimised between not available') attr(x, 'tzone') = 'UTC' test(2031.14, between(x, dn, up), output = 'between parallel processing of double') + ### additional flexibility -- cast when one bound is already POSIXct -up = as.POSIXct(up) +up = as.POSIXct(up, tz="UTC") test(2031.15, between(x, dn, up), output = 'between parallel processing of double') + # `between` support `.` in RHS #2315 X = data.table(a = 1:5, b = 6:10, c = c(5:1)) test(2031.16, X[c %between% list(a, b)], X[c %between% .(a, b)]) From e25b0f4e1aeb57479619acf2fc6df695e3d90501 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 1 May 2019 16:02:43 +0200 Subject: [PATCH 10/22] more strict condition to redirect to Cbetween --- R/between.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/between.R b/R/between.R index 11e8af61d1..f1ef322ccf 100644 --- a/R/between.R +++ b/R/between.R @@ -33,7 +33,7 @@ between <- function(x,lower,upper,incbounds=TRUE) { stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") } } - is.supported = function(x) is.integer(x) || (storage.mode(x)=="double" && !inherits(x, "integer64")) + is.supported = function(x) (is.numeric(x) && !inherits(x, "integer64")) || is.px(x) if (is.supported(x) && is.supported(lower) && is.supported(upper)) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). From 1259b6643975f026ebdfb128101cf1ac6beb8ff1 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 1 May 2019 16:41:18 +0200 Subject: [PATCH 11/22] between unit tests, minor reformat --- R/between.R | 16 ++++++++-------- inst/tests/tests.Rraw | 22 ++++++++++++++++++---- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/R/between.R b/R/between.R index f1ef322ccf..1911e01248 100644 --- a/R/between.R +++ b/R/between.R @@ -7,28 +7,28 @@ between <- function(x,lower,upper,incbounds=TRUE) { # POSIX special handling to auto coerce character if (is.px(x) && !is.null(tz<-attr(x, "tzone", TRUE)) && nzchar(tz) && (is.character(lower) || is.character(upper))) { - try_posix_cast <- function(x, tz) tryCatch( + try_posix_cast <- function(x, tz) {tryCatch( list(status=0L, value=as.POSIXct(x, tz = tz)), error = function(e) list(status=1L, value=NULL, message=e[["message"]]) - ) + )} if (is.character(lower)) { ans = try_posix_cast(lower, tz) if (ans$status==0L) lower = ans$value - else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with:", ans$message) + else stop("'between' function the 'x' argument is a POSIX class while 'lower' was not, coercion to POSIX failed with: ", ans$message) } if (is.character(upper)) { ans = try_posix_cast(upper, tz) if (ans$status==0L) upper = ans$value - else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with:", ans$message) + else stop("'between' function the 'x' argument is a POSIX class while 'upper' was not, coercion to POSIX failed with: ", ans$message) } stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal } # POSIX check time zone match if (is.px(x) && is.px(lower) && is.px(upper)) { - tz_match = function(x, y, z) ( # NULL match "", else all identical - ((is.null(x) || !nzchar(x)) && (is.null(y) || !nzchar(y)) && (is.null(z) || !nzchar(z))) - || (identical(x, y) && identical(x, z)) - ) + tz_match = function(x, y, z) { # NULL match "", else all identical + ((is.null(x) || !nzchar(x)) && (is.null(y) || !nzchar(y)) && (is.null(z) || !nzchar(z))) || + (identical(x, y) && identical(x, z)) + } if (!tz_match(attr(x, "tzone", TRUE), attr(lower, "tzone", TRUE), attr(upper, "tzone", TRUE))) { stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 497dc2937f..dd794a225e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14151,7 +14151,7 @@ up = as.POSIXct('2016-09-18 09:00:00') test(2032.11, between(x, dn, up), output="between parallel processing of double using closed bounds with recycling took") test(2032.12, between(x, dn, up, incbounds=FALSE), output="between parallel processing of double using open bounds with recycling took") -## also handling of string lower/upper bounds _only when x has a time zone_ +# also handling of string lower/upper bounds _only when x has a time zone_ x = as.POSIXct("2016-09-18 07:00:00") + 0:10*60*15 dn = '2016-09-18 08:00:00' up = '2016-09-18 09:00:00' @@ -14159,14 +14159,28 @@ test(2032.13, between(x, dn, up), output = 'optimised between not available') attr(x, 'tzone') = 'UTC' test(2032.14, between(x, dn, up), output = 'between parallel processing of double') -### additional flexibility -- cast when one bound is already POSIXct +# additional flexibility -- cast when one bound is already POSIXct up = as.POSIXct(up, tz="UTC") test(2032.15, between(x, dn, up), output = 'between parallel processing of double') +options(old) + +# exceptions in char to POSIX coercion +dn = 'aa2016-09-18 08:00:00' +test(2032.16, between(x, dn, up), error="coercion to POSIX failed") +dn = '2016-09-18 08:00:00' +up = 'bb2016-09-18 09:00:00' +test(2032.17, between(x, dn, up), error="coercion to POSIX failed") + +# exceptions due to timezone mismatch +x = as.POSIXct("2016-09-18 07:00:00", tz="UTC") + 0:10*60*15 +dn = as.POSIXct('2016-09-18 08:00:00') +up = '2016-09-18 09:00:00' +test(2032.18, between(x, dn, up), error="mismatched timezone attribute") # `between` support `.` in RHS #2315 X = data.table(a = 1:5, b = 6:10, c = c(5:1)) -test(2032.16, X[c %between% list(a, b)], X[c %between% .(a, b)]) -options(old) +test(2032.19, X[c %between% list(a, b)], X[c %between% .(a, b)]) + ################################### From aa175aa1233f1e52701c7a752bdc726a87822fcd Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 1 May 2019 18:34:25 +0200 Subject: [PATCH 12/22] between int64 support and num-to-int coerce, #3517 --- R/between.R | 13 ++++++-- inst/tests/tests.Rraw | 14 +++++++++ src/between.c | 73 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 87 insertions(+), 13 deletions(-) diff --git a/R/between.R b/R/between.R index 1911e01248..81cc55ef01 100644 --- a/R/between.R +++ b/R/between.R @@ -4,6 +4,7 @@ between <- function(x,lower,upper,incbounds=TRUE) { if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type) if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type) is.px = function(x) inherits(x, "POSIXct") + is.i64 = function(x) inherits(x, "integer64") # POSIX special handling to auto coerce character if (is.px(x) && !is.null(tz<-attr(x, "tzone", TRUE)) && nzchar(tz) && (is.character(lower) || is.character(upper))) { @@ -23,7 +24,7 @@ between <- function(x,lower,upper,incbounds=TRUE) { } stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal } - # POSIX check time zone match + # POSIX check timezone match if (is.px(x) && is.px(lower) && is.px(upper)) { tz_match = function(x, y, z) { # NULL match "", else all identical ((is.null(x) || !nzchar(x)) && (is.null(y) || !nzchar(y)) && (is.null(z) || !nzchar(z))) || @@ -33,7 +34,15 @@ between <- function(x,lower,upper,incbounds=TRUE) { stop("'between' function arguments have mismatched timezone attribute, align all arguments to same timezone") } } - is.supported = function(x) (is.numeric(x) && !inherits(x, "integer64")) || is.px(x) + # int64 + if (is.i64(x)) { + if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") + if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) + if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) + } else if (is.i64(lower) || is.i64(upper)) { + stop("'lower' and/or 'upper' are integer64 class while 'x' argument is not, align classes before passing to 'between'") + } + is.supported = function(x) is.numeric(x) || is.px(x) if (is.supported(x) && is.supported(lower) && is.supported(upper)) { # faster parallelised version for int/double. # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dd794a225e..7acdb6e712 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14181,6 +14181,20 @@ test(2032.18, between(x, dn, up), error="mismatched timezone attribute") X = data.table(a = 1:5, b = 6:10, c = c(5:1)) test(2032.19, X[c %between% list(a, b)], X[c %between% .(a, b)]) +# between num to int coercion #3517 +old = options("datatable.verbose"=TRUE) +#TODO +options(old) + +# between int64 support +if (test_bit64) { + as.i64 = bit64::as.integer64 + test(2032.51, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' argument is not") + test(2032.52, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' argument is not") + old = options("datatable.verbose"=TRUE) + #TODO + options(old) +} ################################### diff --git a/src/between.c b/src/between.c index e8d1bf045c..58de4bed62 100644 --- a/src/between.c +++ b/src/between.c @@ -1,5 +1,8 @@ #include "data.table.h" +#define NA_INTEGER64 LLONG_MIN +#define MAX_INTEGER64 LLONG_MAX + SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { R_len_t nx = length(x), nl = length(lower), nu = length(upper); @@ -18,15 +21,32 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { int nprotect = 0; bool integer=true; - if (isReal(x) || isReal(lower) || isReal(upper)) { - if (inherits(x,"integer64") || inherits(lower,"integer64") || inherits(upper,"integer64")) { - error("Internal error: one or more of x, lower and upper is type integer64 but this should have been caught by between() at R level."); // # nocov + bool integer64=false; + if (isInteger(x) && // #3517 coerce to num to int when possible + (isInteger(lower) || (!isInteger(lower) && !isReallyReal(lower))) && + (isInteger(upper) || (!isInteger(upper) && !isReallyReal(upper)))) { + if (!isInteger(lower)) { + lower = PROTECT(coerceVector(lower, INTSXP)); nprotect++; + } + if (!isInteger(upper)) { + upper = PROTECT(coerceVector(upper, INTSXP)); nprotect++; } + } else if (inherits(x,"integer64")) { + if (!inherits(lower,"integer64") || !inherits(upper,"integer64")) + error("Internal error in between: 'x' is integer64 while 'lower' and/or 'upper' are not, should have been catched by now"); // # nocov integer=false; - lower = PROTECT(coerceVector(lower, REALSXP)); // these coerces will convert NA appropriately - upper = PROTECT(coerceVector(upper, REALSXP)); - x = PROTECT(coerceVector(x, REALSXP)); - nprotect += 3; + integer64=true; + } else if (isReal(x) || isReal(lower) || isReal(upper)) { + integer=false; + if (!isReal(x)) { + x = PROTECT(coerceVector(x, REALSXP)); nprotect++; + } + if (!isReal(lower)) { + lower = PROTECT(coerceVector(lower, REALSXP)); nprotect++; // these coerces will convert NA appropriately + } + if (!isReal(upper)) { + upper = PROTECT(coerceVector(upper, REALSXP)); nprotect++; + } } // TODO: sweep through lower and upper ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX @@ -52,9 +72,9 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { if (verbose) Rprintf("between parallel processing of integer with recycling took %8.3fs\n", omp_get_wtime()-tic); } else { - const int xMask = recycleX ? 0 : INT_MAX; - const int lowMask = recycleLow ? 0 : INT_MAX; - const int uppMask = recycleUpp ? 0 : INT_MAX; + const int64_t xMask = recycleX ? 0 : INT_MAX; + const int64_t lowMask = recycleLow ? 0 : INT_MAX; + const int64_t uppMask = recycleUpp ? 0 : INT_MAX; if (verbose) tic = omp_get_wtime(); #pragma omp parallel for num_threads(getDTthreads()) for (int i=0; i Date: Wed, 1 May 2019 18:42:05 +0200 Subject: [PATCH 13/22] update news file for more changes in between function --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 55aff2088f..5fff6fe6ed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -48,7 +48,7 @@ 7. New variable `.Last.updated` (similar to R's `.Last.value`) contains the number of rows affected by the most recent `:=` or `set()`, [#1885](https://github.com/Rdatatable/data.table/issues/1885). -8. `between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. +8. `between()` and `%between%` are faster for `POSIXct`, [#3519](https://github.com/Rdatatable/data.table/issues/3519), and now support the `.()` alias, [#2315](https://github.com/Rdatatable/data.table/issues/2315). Thanks to @Henrik-P for the reports. There is now also support for `bit64`'s `integer64` class and more robust coercion of types, [#3517](https://github.com/Rdatatable/data.table/issues/3517). #### BUG FIXES From 102c6db250c8532c46d8c4a786b18a941d8b2816 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 1 May 2019 18:46:43 +0200 Subject: [PATCH 14/22] revert mistakenly altered branch of int32 --- src/between.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/between.c b/src/between.c index 58de4bed62..dc2deecf93 100644 --- a/src/between.c +++ b/src/between.c @@ -72,9 +72,9 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { if (verbose) Rprintf("between parallel processing of integer with recycling took %8.3fs\n", omp_get_wtime()-tic); } else { - const int64_t xMask = recycleX ? 0 : INT_MAX; - const int64_t lowMask = recycleLow ? 0 : INT_MAX; - const int64_t uppMask = recycleUpp ? 0 : INT_MAX; + const int xMask = recycleX ? 0 : INT_MAX; + const int lowMask = recycleLow ? 0 : INT_MAX; + const int uppMask = recycleUpp ? 0 : INT_MAX; if (verbose) tic = omp_get_wtime(); #pragma omp parallel for num_threads(getDTthreads()) for (int i=0; i Date: Wed, 1 May 2019 19:12:48 +0200 Subject: [PATCH 15/22] fixes and tests towards #3517, speed up isReallyReal --- inst/tests/tests.Rraw | 4 +++- src/between.c | 17 +++++++++++++++-- src/forder.c | 5 +++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7acdb6e712..dd3e239a9b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14183,7 +14183,9 @@ test(2032.19, X[c %between% list(a, b)], X[c %between% .(a, b)]) # between num to int coercion #3517 old = options("datatable.verbose"=TRUE) -#TODO +test(2032.20, between(1:5, 2, 4), output="between parallel processing of integer") +test(2032.21, between(1:5, 2L, 4), output="between parallel processing of integer") +test(2032.22, between(1:5, 2, 4L), output="between parallel processing of integer") options(old) # between int64 support diff --git a/src/between.c b/src/between.c index dc2deecf93..fb890afa12 100644 --- a/src/between.c +++ b/src/between.c @@ -3,6 +3,18 @@ #define NA_INTEGER64 LLONG_MIN #define MAX_INTEGER64 LLONG_MAX +bool isReallyRealC(SEXP x) { + if (!isReal(x)) return(false); + R_xlen_t n=xlength(x), i=0; + double *dx = REAL(x); + while (i Date: Wed, 1 May 2019 19:17:53 +0200 Subject: [PATCH 16/22] remove debug print --- src/between.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/between.c b/src/between.c index fb890afa12..674e4f9036 100644 --- a/src/between.c +++ b/src/between.c @@ -34,7 +34,6 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { int nprotect = 0; bool integer=true; bool integer64=false; - Rprintf("int(x)=%d, int(lower)=%d, rreal(lower)=%d, int(upper)=%d, rreal(upper)=%d\n", isInteger(x), isInteger(lower), isReallyRealC(lower), isReallyRealC(upper), isReallyRealC(upper)); if (isInteger(x) && // #3517 coerce to num to int when possible (isInteger(lower) || (!isInteger(lower) && !isReallyRealC(lower))) && (isInteger(upper) || (!isInteger(upper) && !isReallyRealC(upper)))) { From 6ed0d69b003b82721a60a5c6d42b556e5b0dbea5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 1 May 2019 19:22:56 +0200 Subject: [PATCH 17/22] typo fix --- src/between.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/between.c b/src/between.c index 674e4f9036..81f5a9d29c 100644 --- a/src/between.c +++ b/src/between.c @@ -154,7 +154,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { int64_t elem = xp[i]; ansp[i] = elem==NA_INTEGER64 ? NA_LOGICAL : (l<=elem && elem<=u); } - if (verbose) Rprintf("between parallel processing of intege64r with recycling took %8.3fs\n", omp_get_wtime()-tic); + if (verbose) Rprintf("between parallel processing of integer64 with recycling took %8.3fs\n", omp_get_wtime()-tic); } else { const int xMask = recycleX ? 0 : INT_MAX; From 12f1d2b9079fa36460b23e049223d1513661c296 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 1 May 2019 19:29:43 +0200 Subject: [PATCH 18/22] between int64 unit tests --- inst/tests/tests.Rraw | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dd3e239a9b..c0849cd725 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14183,9 +14183,9 @@ test(2032.19, X[c %between% list(a, b)], X[c %between% .(a, b)]) # between num to int coercion #3517 old = options("datatable.verbose"=TRUE) -test(2032.20, between(1:5, 2, 4), output="between parallel processing of integer") -test(2032.21, between(1:5, 2L, 4), output="between parallel processing of integer") -test(2032.22, between(1:5, 2, 4L), output="between parallel processing of integer") +test(2032.20, between(1:5, 2, 4), output="between parallel processing of integer with recycling") +test(2032.21, between(1:5, 2L, 4), output="between parallel processing of integer with recycling") +test(2032.22, between(1:5, 2, 4L), output="between parallel processing of integer with recycling") options(old) # between int64 support @@ -14194,7 +14194,20 @@ if (test_bit64) { test(2032.51, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' argument is not") test(2032.52, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' argument is not") old = options("datatable.verbose"=TRUE) - #TODO + ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) + ans36open = c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE) + test(2032.53, between(as.i64(1:10), 3, 6), ans36, output="between parallel processing of integer64 with recycling") + test(2032.54, between(as.i64(1:10), 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.55, between(as.i64(1:10), 3L, 6), ans36, output="between parallel processing of integer64 with recycling") + test(2032.56, between(as.i64(1:10), 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.57, between(as.i64(1:10), 3, 6L), ans36, output="between parallel processing of integer64 with recycling") + test(2032.58, between(as.i64(1:10), 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.59, between(as.i64(1:10), 3L, 6L), ans36, output="between parallel processing of integer64 with recycling") + test(2032.60, between(as.i64(1:10), 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + #TODO without recycling + #TODO real int64 + #TODO NA int64 in x/lower/upper + #TODO NA int64 in lower/upper without recycling options(old) } From 1548cce73e7f811e527b419682a139e6b6edf476 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 2 May 2019 14:54:49 +0200 Subject: [PATCH 19/22] minor fix to between int64 and more unit tests --- inst/tests/tests.Rraw | 35 +++++++++++++++++++++++------------ src/between.c | 2 +- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c0849cd725..bee176d41a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14194,20 +14194,31 @@ if (test_bit64) { test(2032.51, between(1:10, as.i64(3), as.i64(6)), error="are integer64 class while 'x' argument is not") test(2032.52, between(1:10, 3, as.i64(6)), error="are integer64 class while 'x' argument is not") old = options("datatable.verbose"=TRUE) + x = as.i64(1:10) ans36 = c(FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE) ans36open = c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE) - test(2032.53, between(as.i64(1:10), 3, 6), ans36, output="between parallel processing of integer64 with recycling") - test(2032.54, between(as.i64(1:10), 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2032.55, between(as.i64(1:10), 3L, 6), ans36, output="between parallel processing of integer64 with recycling") - test(2032.56, between(as.i64(1:10), 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2032.57, between(as.i64(1:10), 3, 6L), ans36, output="between parallel processing of integer64 with recycling") - test(2032.58, between(as.i64(1:10), 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - test(2032.59, between(as.i64(1:10), 3L, 6L), ans36, output="between parallel processing of integer64 with recycling") - test(2032.60, between(as.i64(1:10), 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") - #TODO without recycling - #TODO real int64 - #TODO NA int64 in x/lower/upper - #TODO NA int64 in lower/upper without recycling + test(2032.53, between(x, 3, 6), ans36, output="between parallel processing of integer64 with recycling") + test(2032.54, between(x, 3, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.55, between(x, 3L, 6), ans36, output="between parallel processing of integer64 with recycling") + test(2032.56, between(x, 3L, 6, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.57, between(x, 3, 6L), ans36, output="between parallel processing of integer64 with recycling") + test(2032.58, between(x, 3, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.59, between(x, 3L, 6L), ans36, output="between parallel processing of integer64 with recycling") + test(2032.60, between(x, 3L, 6L, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.61, between(x, rep(3, 10L), rep(6, 10L)), ans36, output="between parallel processing of integer64 took") + test(2032.62, between(x, rep(3, 10L), rep(6, 10L), incbounds=FALSE), ans36open, output="between parallel processing of integer64 took") + maxint = 2147483647 + test(2032.63, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") + test(2032.64, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + x[5] = NA + ans36[5] = NA + ans36open[5] = NA + test(2032.65, between(x+maxint, 3+maxint, 6+maxint), ans36, output="between parallel processing of integer64 with recycling") + test(2032.66, between(x+maxint, 3+maxint, 6+maxint, incbounds=FALSE), ans36open, output="between parallel processing of integer64 with recycling") + test(2032.67, between(x+maxint, NA, 6+maxint), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 with recycling") + test(2032.68, between(x+maxint, 3+maxint, NA, incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 with recycling") + test(2032.69, between(x+maxint, rep(NA, 10L), rep(6+maxint, 10L)), c(TRUE, TRUE, tail(ans36, -2L)), output="between parallel processing of integer64 took") + test(2032.70, between(x+maxint, rep(3+maxint, 10L), rep(NA, 10L), incbounds=FALSE), c(head(ans36open, -5L), rep(TRUE, 5)), output="between parallel processing of integer64 took") options(old) } diff --git a/src/between.c b/src/between.c index 81f5a9d29c..6b66033900 100644 --- a/src/between.c +++ b/src/between.c @@ -164,7 +164,7 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { #pragma omp parallel for num_threads(getDTthreads()) for (int i=0; i Date: Fri, 3 May 2019 13:24:00 +0200 Subject: [PATCH 20/22] between changes codecov --- R/between.R | 2 +- src/between.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/between.R b/R/between.R index 81cc55ef01..a32117f4ff 100644 --- a/R/between.R +++ b/R/between.R @@ -36,7 +36,7 @@ between <- function(x,lower,upper,incbounds=TRUE) { } # int64 if (is.i64(x)) { - if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") + if (!requireNamespace("bit64", quietly=TRUE)) stop("trying to use integer64 class when 'bit64' package is not installed") # nocov if (!is.i64(lower) && is.numeric(lower)) lower = bit64::as.integer64(lower) if (!is.i64(upper) && is.numeric(upper)) upper = bit64::as.integer64(upper) } else if (is.i64(lower) || is.i64(upper)) { diff --git a/src/between.c b/src/between.c index 6b66033900..a6820d8035 100644 --- a/src/between.c +++ b/src/between.c @@ -48,17 +48,16 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { error("Internal error in between: 'x' is integer64 while 'lower' and/or 'upper' are not, should have been catched by now"); // # nocov integer=false; integer64=true; - } else if (isReal(x) || isReal(lower) || isReal(upper)) { + } else if (isReal(x)) { integer=false; - if (!isReal(x)) { - x = PROTECT(coerceVector(x, REALSXP)); nprotect++; - } if (!isReal(lower)) { lower = PROTECT(coerceVector(lower, REALSXP)); nprotect++; // these coerces will convert NA appropriately } if (!isReal(upper)) { upper = PROTECT(coerceVector(upper, REALSXP)); nprotect++; } + } else { + error("Internal error in between: 'x' is not int, double or int64, should have been catched by now"); // # nocov } // TODO: sweep through lower and upper ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX From a0bd9b5dedaa0a56bd1b14b4cf38969cfdc34ef5 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 13 May 2019 14:13:15 -0700 Subject: [PATCH 21/22] LLONG_MIN => INT64_MIN to isolate from possible LLONG variance, and moved to data.table.h --- src/between.c | 3 --- src/data.table.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/between.c b/src/between.c index a6820d8035..70bd7c3978 100644 --- a/src/between.c +++ b/src/between.c @@ -1,8 +1,5 @@ #include "data.table.h" -#define NA_INTEGER64 LLONG_MIN -#define MAX_INTEGER64 LLONG_MAX - bool isReallyRealC(SEXP x) { if (!isReal(x)) return(false); R_xlen_t n=xlength(x), i=0; diff --git a/src/data.table.h b/src/data.table.h index 0b4632c9e0..7e4751ba0d 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -34,6 +34,10 @@ typedef R_xlen_t RLEN; #endif #define MAX(a,b) (((a)>(b))?(a):(b)) +// for use with bit64::integer64 +#define NA_INTEGER64 INT64_MIN +#define MAX_INTEGER64 INT64_MAX + // Backport macros added to R in 2017 so we don't need to update dependency from R 3.0.0 #ifndef MAYBE_SHARED # define MAYBE_SHARED(x) (NAMED(x) > 1) From 89fa2cf5160cbf977029ba77c37adc64c7f8d1a9 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 13 May 2019 14:31:10 -0700 Subject: [PATCH 22/22] isReallyRealC => isRealReallyInt to simplify || --- src/between.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/between.c b/src/between.c index 70bd7c3978..361bf70a00 100644 --- a/src/between.c +++ b/src/between.c @@ -1,6 +1,6 @@ #include "data.table.h" -bool isReallyRealC(SEXP x) { +bool isRealReallyInt(SEXP x) { if (!isReal(x)) return(false); R_xlen_t n=xlength(x), i=0; double *dx = REAL(x); @@ -9,7 +9,7 @@ bool isReallyRealC(SEXP x) { ( R_FINITE(dx[i]) && dx[i] == (int)(dx[i])))) { i++; } - return(iupper or lower==INT_MAX @@ -169,6 +169,6 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) { } } UNPROTECT(nprotect); - return(ans); + return ans; }